@@ -21,21 +21,23 @@
 import os
 from typing import Any, Iterable, Mapping, NoReturn, Optional

-from bson.binary import Binary
 from bson.int64 import Int64
 from bson.objectid import ObjectId
 from gridfs.errors import CorruptGridFile, FileExists, NoFile
 from pymongo import ASCENDING
 from pymongo.client_session import ClientSession
 from pymongo.collection import Collection
+from pymongo.common import MAX_MESSAGE_SIZE
 from pymongo.cursor import Cursor
 from pymongo.errors import (
+    BulkWriteError,
     ConfigurationError,
     CursorNotFound,
     DuplicateKeyError,
     InvalidOperation,
     OperationFailure,
 )
+from pymongo.helpers import _check_write_command_response
 from pymongo.read_preferences import ReadPreference

 _SEEK_SET = os.SEEK_SET
@@ -48,6 +50,13 @@
 """Default chunk size, in bytes."""
 # Slightly under a power of 2, to work well with server's record allocations.
 DEFAULT_CHUNK_SIZE = 255 * 1024
+# The number of chunked bytes to buffer before calling insert_many.
+_UPLOAD_BUFFER_SIZE = MAX_MESSAGE_SIZE
+# The number of chunk documents to buffer before calling insert_many.
+_UPLOAD_BUFFER_CHUNKS = 100000
+# Rough BSON overhead of a chunk document not including the chunk data itself.
+# Essentially len(encode({"_id": ObjectId(), "files_id": ObjectId(), "n": 1, "data": ""}))
+_CHUNK_OVERHEAD = 60

 _C_INDEX: dict[str, Any] = {"files_id": ASCENDING, "n": ASCENDING}
 _F_INDEX: dict[str, Any] = {"filename": ASCENDING, "uploadDate": ASCENDING}
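The new `_CHUNK_OVERHEAD` constant is the rough per-document BSON overhead described in the comment above. A minimal sketch (not part of the patch, purely illustrative) reproducing that estimate with `bson.encode`:

# Illustrative check only: encode an empty chunk document to see the fixed
# per-document overhead that _CHUNK_OVERHEAD approximates.
from bson import encode
from bson.binary import Binary
from bson.objectid import ObjectId

empty_chunk = {"_id": ObjectId(), "files_id": ObjectId(), "n": 1, "data": Binary(b"")}
print(len(encode(empty_chunk)))  # a little over 60 bytes; _CHUNK_OVERHEAD is the rounded estimate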
@@ -198,6 +207,8 @@ def __init__(
         object.__setattr__(self, "_chunk_number", 0)
         object.__setattr__(self, "_closed", False)
         object.__setattr__(self, "_ensured_index", False)
+        object.__setattr__(self, "_buffered_docs", [])
+        object.__setattr__(self, "_buffered_docs_size", 0)

     def __create_index(self, collection: Collection, index_key: Any, unique: bool) -> None:
         doc = collection.find_one(projection={"_id": 1}, session=self._session)
@@ -249,6 +260,8 @@ def closed(self) -> bool:

     _buffer: io.BytesIO
     _closed: bool
+    _buffered_docs: list[dict[str, Any]]
+    _buffered_docs_size: int

     def __getattr__(self, name: str) -> Any:
         if name in self._file:
@@ -268,32 +281,52 @@ def __setattr__(self, name: str, value: Any) -> None:
         if self._closed:
             self._coll.files.update_one({"_id": self._file["_id"]}, {"$set": {name: value}})

-    def __flush_data(self, data: Any) -> None:
+    def __flush_data(self, data: Any, force: bool = False) -> None:
         """Flush `data` to a chunk."""
         self.__ensure_indexes()
-        if not data:
-            return
         assert len(data) <= self.chunk_size
-
-        chunk = {"files_id": self._file["_id"], "n": self._chunk_number, "data": Binary(data)}
-
-        try:
-            self._chunks.insert_one(chunk, session=self._session)
-        except DuplicateKeyError:
-            self._raise_file_exists(self._file["_id"])
+        if data:
+            self._buffered_docs.append(
+                {"files_id": self._file["_id"], "n": self._chunk_number, "data": data}
+            )
+            self._buffered_docs_size += len(data) + _CHUNK_OVERHEAD
+        if not self._buffered_docs:
+            return
+        # Limit to 100,000 chunks or 32MB (+1 chunk) of data.
+        if (
+            force
+            or self._buffered_docs_size >= _UPLOAD_BUFFER_SIZE
+            or len(self._buffered_docs) >= _UPLOAD_BUFFER_CHUNKS
+        ):
+            try:
+                self._chunks.insert_many(self._buffered_docs, session=self._session)
+            except BulkWriteError as exc:
+                # For backwards compatibility, raise an insert_one style exception.
+                write_errors = exc.details["writeErrors"]
+                for err in write_errors:
+                    if err.get("code") in (11000, 11001, 12582):  # Duplicate key errors
+                        self._raise_file_exists(self._file["_id"])
+                result = {"writeErrors": write_errors}
+                wces = exc.details["writeConcernErrors"]
+                if wces:
+                    result["writeConcernError"] = wces[-1]
+                _check_write_command_response(result)
+                raise
+            self._buffered_docs = []
+            self._buffered_docs_size = 0
         self._chunk_number += 1
         self._position += len(data)

-    def __flush_buffer(self) -> None:
+    def __flush_buffer(self, force: bool = False) -> None:
         """Flush the buffer contents out to a chunk."""
-        self.__flush_data(self._buffer.getvalue())
+        self.__flush_data(self._buffer.getvalue(), force=force)
         self._buffer.close()
         self._buffer = io.BytesIO()

     def __flush(self) -> Any:
         """Flush the file to the database."""
         try:
-            self.__flush_buffer()
+            self.__flush_buffer(force=True)
             # The GridFS spec says length SHOULD be an Int64.
             self._file["length"] = Int64(self._position)
             self._file["uploadDate"] = datetime.datetime.now(tz=datetime.timezone.utc)
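With this change, `__flush_data` buffers chunk documents and writes them with a single `insert_many` when `force=True` (from `__flush`) or when the buffer crosses `_UPLOAD_BUFFER_SIZE` or `_UPLOAD_BUFFER_CHUNKS`, instead of one `insert_one` per 255 KiB chunk. A minimal usage sketch of the effect (the URI and database name below are placeholders, not part of the patch):

# Illustrative only: a moderate GridFS upload now issues one insert_many on the
# chunks collection at close (via __flush -> __flush_buffer(force=True)) rather
# than one insert_one per chunk.
import gridfs
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # placeholder URI
fs = gridfs.GridFS(client["test_db"])  # placeholder database name

payload = b"\x00" * (8 * 1024 * 1024)  # ~33 chunks at the default 255 KiB chunk size
file_id = fs.put(payload, filename="example.bin")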