 // specific language governing permissions and limitations
 // under the License.

-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
+use std::ops::Not;
 use std::sync::Arc;

+use arrow_array::{
+    Array, ArrayRef, BooleanArray, Date32Array, Float32Array, Float64Array, Int32Array, Int64Array,
+    StringArray, Time64MicrosecondArray, TimestampMicrosecondArray, TimestampNanosecondArray,
+};
 use futures::channel::oneshot;
 use futures::future::join_all;
 use futures::{StreamExt, TryStreamExt};
+use itertools::Itertools;
 use tokio::sync::oneshot::{channel, Receiver};

 use super::delete_filter::{DeleteFilter, EqDelFuture};
 use crate::arrow::record_batch_transformer::RecordBatchTransformer;
-use crate::arrow::ArrowReader;
+use crate::arrow::{arrow_schema_to_schema, ArrowReader};
 use crate::delete_vector::DeleteVector;
-use crate::expr::Predicate;
+use crate::expr::Predicate::AlwaysTrue;
+use crate::expr::{Predicate, Reference};
 use crate::io::FileIO;
 use crate::scan::{ArrowRecordBatchStream, FileScanTaskDeleteFile};
-use crate::spec::{DataContentType, Schema, SchemaRef};
+use crate::spec::{DataContentType, Datum, NestedFieldRef, PrimitiveType, Schema, SchemaRef};
 use crate::{Error, ErrorKind, Result};

 #[allow(unused)]
@@ -75,6 +82,7 @@ enum DeleteFileContext {
     FreshEqDel {
         batch_stream: ArrowRecordBatchStream,
         sender: oneshot::Sender<Predicate>,
+        equality_ids: HashSet<i32>,
     },
 }

@@ -258,6 +266,7 @@ impl CachingDeleteFileLoader {
                 )
                 .await?,
                 sender,
+                equality_ids: HashSet::from_iter(task.equality_ids.clone()),
             })
         }

@@ -281,9 +290,11 @@ impl CachingDeleteFileLoader {
             DeleteFileContext::FreshEqDel {
                 sender,
                 batch_stream,
+                equality_ids,
             } => {
                 let predicate =
-                    Self::parse_equality_deletes_record_batch_stream(batch_stream).await?;
+                    Self::parse_equality_deletes_record_batch_stream(batch_stream, equality_ids)
+                        .await?;

                 sender
                     .send(predicate)
@@ -361,14 +372,126 @@ impl CachingDeleteFileLoader {
     ///
     /// Returns an unbound Predicate for each batch stream
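+    /// (the conjunction, over all delete rows, of the negation of each row's column-equality
+    /// terms; `rewrite_not()` then normalizes this into `!=` / `OR` form)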
     async fn parse_equality_deletes_record_batch_stream(
-        streams: ArrowRecordBatchStream,
+        mut stream: ArrowRecordBatchStream,
+        equality_ids: HashSet<i32>,
     ) -> Result<Predicate> {
-        // TODO
+        let mut result_predicate = AlwaysTrue;

-        Err(Error::new(
-            ErrorKind::FeatureUnsupported,
-            "parsing of equality deletes is not yet supported",
-        ))
+        while let Some(record_batch) = stream.next().await {
+            let record_batch = record_batch?;
+
+            if record_batch.num_columns() == 0 {
+                return Ok(AlwaysTrue);
+            }
+
+            let batch_schema_arrow = record_batch.schema();
+            let batch_schema_iceberg = arrow_schema_to_schema(batch_schema_arrow.as_ref())?;
+
+            let mut datum_columns_with_names: Vec<_> = record_batch
+                .columns()
+                .iter()
+                .zip(batch_schema_iceberg.as_struct().fields())
+                // only use columns that are in the set of equality_ids for this delete file
+                .filter(|(field, value)| equality_ids.contains(&value.id))
+                .map(|(column, field)| {
+                    let col_as_datum_vec = arrow_array_to_datum_iterator(column, field);
+                    col_as_datum_vec.map(|c| (c, field.name.to_string()))
+                })
+                .try_collect()?;
+
+            // consume all the iterators in lockstep, creating per-row predicates that get combined
+            // into a single final predicate
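+            // e.g. a delete row (y=1, z=100, a="HELP") contributes NOT(y = 1 AND z = 100 AND a = "HELP"),
+            // which the final rewrite_not() call turns into (y != 1) OR (z != 100) OR (a != "HELP")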
+            while datum_columns_with_names[0].0.len() > 0 {
+                let mut row_predicate = AlwaysTrue;
+                for (ref mut column, ref field_name) in &mut datum_columns_with_names {
+                    if let Some(item) = column.next() {
+                        if let Some(datum) = item? {
+                            row_predicate = row_predicate
+                                .and(Reference::new(field_name.clone()).equal_to(datum.clone()));
+                        }
+                    }
+                }
+                result_predicate = result_predicate.and(row_predicate.not());
+            }
+        }
+        Ok(result_predicate.rewrite_not())
+    }
+}
+
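+// Downcasts `$column` to the concrete Arrow array type `$arr` and maps each value through the
+// Datum constructor `$dat`, yielding a boxed iterator of `Result<Option<Datum>>` items.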
+macro_rules! prim_to_datum {
+    ($column:ident, $arr:ty, $dat:path) => {{
+        let arr = $column.as_any().downcast_ref::<$arr>().ok_or(Error::new(
+            ErrorKind::Unexpected,
+            format!("could not downcast ArrayRef to {}", stringify!($arr)),
+        ))?;
+        Ok(Box::new(arr.iter().map(|val| Ok(val.map($dat)))))
+    }};
+}
+
+fn eq_col_unsupported(ty: &str) -> Error {
+    Error::new(
+        ErrorKind::FeatureUnsupported,
+        format!(
+            "Equality deletes where a predicate acts upon a {} column are not yet supported",
+            ty
+        ),
+    )
+}
+
+fn arrow_array_to_datum_iterator<'a>(
+    column: &'a ArrayRef,
+    field: &NestedFieldRef,
+) -> Result<Box<dyn ExactSizeIterator<Item = Result<Option<Datum>>> + 'a>> {
+    match field.field_type.as_primitive_type() {
+        Some(primitive_type) => match primitive_type {
+            PrimitiveType::Int => prim_to_datum!(column, Int32Array, Datum::int),
+            PrimitiveType::Boolean => {
+                prim_to_datum!(column, BooleanArray, Datum::bool)
+            }
+            PrimitiveType::Long => prim_to_datum!(column, Int64Array, Datum::long),
+            PrimitiveType::Float => {
+                prim_to_datum!(column, Float32Array, Datum::float)
+            }
+            PrimitiveType::Double => {
+                prim_to_datum!(column, Float64Array, Datum::double)
+            }
+            PrimitiveType::String => {
+                prim_to_datum!(column, StringArray, Datum::string)
+            }
+            PrimitiveType::Date => prim_to_datum!(column, Date32Array, Datum::date),
+            PrimitiveType::Timestamp => {
+                prim_to_datum!(column, TimestampMicrosecondArray, Datum::timestamp_micros)
+            }
+            PrimitiveType::Timestamptz => {
+                prim_to_datum!(column, TimestampMicrosecondArray, Datum::timestamptz_micros)
+            }
+            PrimitiveType::TimestampNs => {
+                prim_to_datum!(column, TimestampNanosecondArray, Datum::timestamp_nanos)
+            }
+            PrimitiveType::TimestamptzNs => {
+                prim_to_datum!(column, TimestampNanosecondArray, Datum::timestamptz_nanos)
+            }
+            PrimitiveType::Time => {
+                let arr = column
+                    .as_any()
+                    .downcast_ref::<Time64MicrosecondArray>()
+                    .ok_or(Error::new(
+                        ErrorKind::Unexpected,
+                        "could not downcast ArrayRef to Time64MicrosecondArray",
+                    ))?;
+                Ok(Box::new(arr.iter().map(|val| match val {
+                    None => Ok(None),
+                    Some(val) => Datum::time_micros(val).map(Some),
+                })))
+            }
+            PrimitiveType::Decimal { .. } => Err(eq_col_unsupported("Decimal")),
+            PrimitiveType::Uuid => Err(eq_col_unsupported("Uuid")),
+            PrimitiveType::Fixed(_) => Err(eq_col_unsupported("Fixed")),
+            PrimitiveType::Binary => Err(eq_col_unsupported("Binary")),
+        },
+        None => Err(eq_col_unsupported(
+            "non-primitive (i.e. Struct, List, or Map)",
+        )),
     }
 }

@@ -395,7 +518,7 @@ mod tests {
     const FIELD_ID_POSITIONAL_DELETE_POS: u64 = 2147483545;

     #[tokio::test]
-    async fn test_delete_file_manager_load_deletes() {
+    async fn test_delete_file_loader_load_deletes() {
         let tmp_dir = TempDir::new().unwrap();
         let table_location = tmp_dir.path();
         let file_io = FileIO::from_path(table_location.as_os_str().to_str().unwrap())
@@ -405,19 +528,47 @@ mod tests {

         // Note that with the delete file parsing not yet in place, all we can test here is that
         // the call to the loader fails with the expected FeatureUnsupportedError.
-        let delete_file_manager = CachingDeleteFileLoader::new(file_io.clone(), 10);
+        let delete_file_loader = CachingDeleteFileLoader::new(file_io.clone(), 10);

-        let file_scan_tasks = setup(table_location);
+        let file_scan_tasks = setup_load_deletes_test_tasks(table_location);

-        let result = delete_file_manager
+        let result = delete_file_loader
             .load_deletes(&file_scan_tasks[0].deletes, file_scan_tasks[0].schema_ref())
             .await
             .unwrap();

         assert!(result.is_err_and(|e| e.kind() == ErrorKind::FeatureUnsupported));
     }

-    fn setup(table_location: &Path) -> Vec<FileScanTask> {
+    #[tokio::test]
+    async fn test_delete_file_loader_parse_equality_deletes() {
+        let tmp_dir = TempDir::new().unwrap();
+        let table_location = tmp_dir.path().as_os_str().to_str().unwrap();
+        let file_io = FileIO::from_path(table_location).unwrap().build().unwrap();
+
+        let eq_delete_file_path = setup_write_equality_delete_file_1(table_location);
+
+        let record_batch_stream =
+            CachingDeleteFileLoader::parquet_to_batch_stream(&eq_delete_file_path, file_io.clone())
+                .await
+                .expect("could not get batch stream");
+
+        let eq_ids = HashSet::from_iter(vec![2, 3, 4]);
+
+        let parsed_eq_delete = CachingDeleteFileLoader::parse_equality_deletes_record_batch_stream(
+            record_batch_stream,
+            eq_ids,
+        )
+        .await
+        .expect("error parsing batch stream");
+        println!("{}", parsed_eq_delete);
+
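+        // The first delete row (y=1, z=100, a="HELP") yields (y != 1) OR (z != 100) OR (a != "HELP");
+        // the second row has NULL z and a, so it contributes only (y != 2).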
+        let expected = "(((y != 1) OR (z != 100)) OR (a != \"HELP\")) AND (y != 2)".to_string();
+
+        assert_eq!(parsed_eq_delete.to_string(), expected);
+    }
+
+    fn setup_load_deletes_test_tasks(table_location: &Path) -> Vec<FileScanTask> {
         let data_file_schema = Arc::new(Schema::builder().build().unwrap());
         let positional_delete_schema = create_pos_del_schema();

@@ -527,4 +678,58 @@ mod tests {
         ];
         Arc::new(arrow_schema::Schema::new(fields))
     }
+
+    fn setup_write_equality_delete_file_1(table_location: &str) -> String {
+        let col_y_vals = vec![1, 2];
+        let col_y = Arc::new(Int64Array::from(col_y_vals)) as ArrayRef;
+
+        let col_z_vals = vec![Some(100), None];
+        let col_z = Arc::new(Int64Array::from(col_z_vals)) as ArrayRef;
+
+        let col_a_vals = vec![Some("HELP"), None];
+        let col_a = Arc::new(StringArray::from(col_a_vals)) as ArrayRef;
+
+        let equality_delete_schema = {
+            let fields = vec![
+                arrow_schema::Field::new("y", arrow_schema::DataType::Int64, true).with_metadata(
+                    HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), "2".to_string())]),
+                ),
+                arrow_schema::Field::new("z", arrow_schema::DataType::Int64, true).with_metadata(
+                    HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), "3".to_string())]),
+                ),
+                arrow_schema::Field::new("a", arrow_schema::DataType::Utf8, true).with_metadata(
+                    HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), "4".to_string())]),
+                ),
+            ];
+            Arc::new(arrow_schema::Schema::new(fields))
+        };
+
+        let equality_deletes_to_write =
+            RecordBatch::try_new(equality_delete_schema.clone(), vec![col_y, col_z, col_a])
+                .unwrap();
+
+        let path = format!("{}/equality-deletes-1.parquet", &table_location);
+
+        let file = File::create(&path).unwrap();
+
+        let props = WriterProperties::builder()
+            .set_compression(Compression::SNAPPY)
+            .build();
+
+        let mut writer = ArrowWriter::try_new(
+            file,
+            equality_deletes_to_write.schema(),
+            Some(props.clone()),
+        )
+        .unwrap();
+
+        writer
+            .write(&equality_deletes_to_write)
+            .expect("Writing batch");
+
+        // writer must be closed to write footer
+        writer.close().unwrap();
+
+        path
+    }
 }