@@ -1022,7 +1022,7 @@ def append(self, df: pa.Table) -> None:
1022
1022
with self .update_snapshot ().fast_append () as update_snapshot :
1023
1023
# skip writing data files if the dataframe is empty
1024
1024
if df .shape [0 ] > 0 :
1025
- data_files = _dataframe_to_data_files (self , df = df )
1025
+ data_files = _dataframe_to_data_files (self , write_uuid = update_snapshot . commit_uuid , df = df )
1026
1026
for data_file in data_files :
1027
1027
update_snapshot .append_data_file (data_file )
1028
1028
@@ -1052,7 +1052,7 @@ def overwrite(self, df: pa.Table, overwrite_filter: BooleanExpression = ALWAYS_T
1052
1052
with self .update_snapshot ().overwrite () as update_snapshot :
1053
1053
# skip writing data files if the dataframe is empty
1054
1054
if df .shape [0 ] > 0 :
1055
- data_files = _dataframe_to_data_files (self , df = df )
1055
+ data_files = _dataframe_to_data_files (self , write_uuid = update_snapshot . commit_uuid , df = df )
1056
1056
for data_file in data_files :
1057
1057
update_snapshot .append_data_file (data_file )
1058
1058
@@ -2349,7 +2349,9 @@ def _generate_manifest_list_path(location: str, snapshot_id: int, attempt: int,
2349
2349
return f'{ location } /metadata/snap-{ snapshot_id } -{ attempt } -{ commit_uuid } .avro'
2350
2350
2351
2351
2352
- def _dataframe_to_data_files (table : Table , df : pa .Table , file_schema : Optional [Schema ] = None ) -> Iterable [DataFile ]:
2352
+ def _dataframe_to_data_files (
2353
+ table : Table , df : pa .Table , write_uuid : Optional [uuid .UUID ] = None , file_schema : Optional [Schema ] = None
2354
+ ) -> Iterable [DataFile ]:
2353
2355
"""Convert a PyArrow table into a DataFile.
2354
2356
2355
2357
Returns:
@@ -2360,31 +2362,37 @@ def _dataframe_to_data_files(table: Table, df: pa.Table, file_schema: Optional[S
2360
2362
if len (table .spec ().fields ) > 0 :
2361
2363
raise ValueError ("Cannot write to partitioned tables" )
2362
2364
2363
- write_uuid = uuid .uuid4 ()
2364
2365
counter = itertools .count (0 )
2366
+ write_uuid = write_uuid or uuid .uuid4 ()
2365
2367
2366
2368
# This is an iter, so we don't have to materialize everything every time
2367
2369
# This will be more relevant when we start doing partitioned writes
2368
2370
yield from write_file (table , iter ([WriteTask (write_uuid , next (counter ), df )]), file_schema = file_schema )
2369
2371
2370
2372
2371
2373
class _MergingSnapshotProducer :
2374
+ commit_uuid : uuid .UUID
2372
2375
_operation : Operation
2373
2376
_table : Table
2374
2377
_snapshot_id : int
2375
2378
_parent_snapshot_id : Optional [int ]
2376
2379
_added_data_files : List [DataFile ]
2377
- _commit_uuid : uuid .UUID
2378
2380
_transaction : Optional [Transaction ]
2379
2381
2380
- def __init__ (self , operation : Operation , table : Table , transaction : Optional [Transaction ] = None ) -> None :
2382
+ def __init__ (
2383
+ self ,
2384
+ operation : Operation ,
2385
+ table : Table ,
2386
+ commit_uuid : Optional [uuid .UUID ] = None ,
2387
+ transaction : Optional [Transaction ] = None ,
2388
+ ) -> None :
2389
+ self .commit_uuid = commit_uuid or uuid .uuid4 ()
2381
2390
self ._operation = operation
2382
2391
self ._table = table
2383
2392
self ._snapshot_id = table .new_snapshot_id ()
2384
2393
# Since we only support the main branch for now
2385
2394
self ._parent_snapshot_id = snapshot .snapshot_id if (snapshot := self ._table .current_snapshot ()) else None
2386
2395
self ._added_data_files = []
2387
- self ._commit_uuid = uuid .uuid4 ()
2388
2396
self ._transaction = transaction
2389
2397
2390
2398
def __enter__ (self ) -> _MergingSnapshotProducer :
@@ -2408,7 +2416,7 @@ def _existing_manifests(self) -> List[ManifestFile]: ...
2408
2416
def _manifests (self ) -> List [ManifestFile ]:
2409
2417
def _write_added_manifest () -> List [ManifestFile ]:
2410
2418
if self ._added_data_files :
2411
- output_file_location = _new_manifest_path (location = self ._table .location (), num = 0 , commit_uuid = self ._commit_uuid )
2419
+ output_file_location = _new_manifest_path (location = self ._table .location (), num = 0 , commit_uuid = self .commit_uuid )
2412
2420
with write_manifest (
2413
2421
format_version = self ._table .format_version ,
2414
2422
spec = self ._table .spec (),
@@ -2434,7 +2442,8 @@ def _write_delete_manifest() -> List[ManifestFile]:
2434
2442
# Check if we need to mark the files as deleted
2435
2443
deleted_entries = self ._deleted_entries ()
2436
2444
if len (deleted_entries ) > 0 :
2437
- output_file_location = _new_manifest_path (location = self ._table .location (), num = 1 , commit_uuid = self ._commit_uuid )
2445
+ output_file_location = _new_manifest_path (location = self ._table .location (), num = 1 , commit_uuid = self .commit_uuid )
2446
+
2438
2447
with write_manifest (
2439
2448
format_version = self ._table .format_version ,
2440
2449
spec = self ._table .spec (),
@@ -2477,7 +2486,7 @@ def commit(self) -> Snapshot:
2477
2486
summary = self ._summary ()
2478
2487
2479
2488
manifest_list_file_path = _generate_manifest_list_path (
2480
- location = self ._table .location (), snapshot_id = self ._snapshot_id , attempt = 0 , commit_uuid = self ._commit_uuid
2489
+ location = self ._table .location (), snapshot_id = self ._snapshot_id , attempt = 0 , commit_uuid = self .commit_uuid
2481
2490
)
2482
2491
with write_manifest_list (
2483
2492
format_version = self ._table .metadata .format_version ,
0 commit comments