Skip to content

Bloom filters for i8 and i16 always return false negatives #5550

@progval

Description

@progval

Describe the bug

It is unclear to me whether the problem lies in building or in checking the Bloom filter. Either way, a Bloom filter built from i8 or i16 values (as opposed to i32 or i64) always returns false when checked, even for values that were written to the column, i.e. it produces false negatives.

To Reproduce

diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs
index 18c8617e07..d6b14e2899 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -2039,6 +2039,36 @@ mod tests {
         values_required::<BinaryArray, _>(many_vecs_iter);
     }
 
+    #[test]
+    fn i8_column_bloom_filter() {
+        let array = Arc::new(Int8Array::from_iter(0..SMALL_SIZE as i8));
+        let mut options = RoundTripOptions::new(array, false);
+        options.bloom_filter = true;
+
+        let files = one_column_roundtrip_with_options(options);
+        check_bloom_filter(
+            files,
+            "col".to_string(),
+            (0..SMALL_SIZE as i8).collect(),
+            (SMALL_SIZE as i8 + 1..SMALL_SIZE as i8 + 10).collect(),
+        );
+    }
+
+    #[test]
+    fn i16_column_bloom_filter() {
+        let array = Arc::new(Int16Array::from_iter(0..SMALL_SIZE as i16));
+        let mut options = RoundTripOptions::new(array, false);
+        options.bloom_filter = true;
+
+        let files = one_column_roundtrip_with_options(options);
+        check_bloom_filter(
+            files,
+            "col".to_string(),
+            (0..SMALL_SIZE as i16).collect(),
+            (SMALL_SIZE as i16 + 1..SMALL_SIZE as i16 + 10).collect(),
+        );
+    }
+
     #[test]
     fn i32_column_bloom_filter() {
         let array = Arc::new(Int32Array::from_iter(0..SMALL_SIZE as i32));
@@ -2054,6 +2084,21 @@ mod tests {
         );
     }
 
+    #[test]
+    fn i64_column_bloom_filter() {
+        let array = Arc::new(Int64Array::from_iter(0..SMALL_SIZE as i64));
+        let mut options = RoundTripOptions::new(array, false);
+        options.bloom_filter = true;
+
+        let files = one_column_roundtrip_with_options(options);
+        check_bloom_filter(
+            files,
+            "col".to_string(),
+            (0..SMALL_SIZE as i64).collect(),
+            (SMALL_SIZE as i64 + 1..SMALL_SIZE as i64 + 10).collect(),
+        );
+    }
+
     #[test]
     fn binary_column_bloom_filter() {
         let one_vec: Vec<u8> = (0..SMALL_SIZE as u8).collect();

Running the tests added by this patch returns:

failures:

---- arrow::arrow_writer::tests::i16_column_bloom_filter stdout ----
thread 'arrow::arrow_writer::tests::i16_column_bloom_filter' panicked at parquet/src/arrow/arrow_writer/mod.rs:1792:17:
Value [0, 0] should be in bloom filter

---- arrow::arrow_writer::tests::i8_column_bloom_filter stdout ----
thread 'arrow::arrow_writer::tests::i8_column_bloom_filter' panicked at parquet/src/arrow/arrow_writer/mod.rs:1792:17:
Value [0] should be in bloom filter


failures:
    arrow::arrow_writer::tests::i16_column_bloom_filter
    arrow::arrow_writer::tests::i8_column_bloom_filter

Expected behavior
These tests should pass: every value written to an i8 or i16 column should be reported as present by the Bloom filter.

Additional context

I found this through DataFusion: apache/datafusion#9779

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions