From 1ceeae988c0acffe7e1edf65bc2817ce6d9b2b1e Mon Sep 17 00:00:00 2001 From: codephage2020 Date: Fri, 8 Aug 2025 15:38:23 +0800 Subject: [PATCH 1/9] Implement support for AllNull state in VariantArray and related output handling Signed-off-by: codephage2020 --- parquet-variant-compute/src/variant_array.rs | 80 ++++++++++++++++--- .../src/variant_get/mod.rs | 73 +++++++++++++++++ .../src/variant_get/output/mod.rs | 7 ++ .../src/variant_get/output/primitive.rs | 14 ++++ .../src/variant_get/output/variant.rs | 13 +++ 5 files changed, 175 insertions(+), 12 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index d51df550622d..800e6dfb9aff 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -187,6 +187,7 @@ impl VariantArray { typed_value_to_variant(typed_value, index) } } + ShreddingState::AllNull { .. } => Variant::Null, } } @@ -226,8 +227,6 @@ impl VariantArray { /// [Parquet Variant Shredding Spec]: https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding #[derive(Debug)] pub enum ShreddingState { - // TODO: add missing state where there is neither value nor typed_value - // Missing { metadata: BinaryViewArray }, /// This variant has no typed_value field Unshredded { metadata: BinaryViewArray, @@ -250,6 +249,8 @@ pub enum ShreddingState { value: BinaryViewArray, typed_value: ArrayRef, }, + /// All values are null, only metadata is present + AllNull { metadata: BinaryViewArray }, } impl ShreddingState { @@ -270,9 +271,7 @@ impl ShreddingState { metadata, typed_value, }), - (_metadata_field, None, None) => Err(ArrowError::InvalidArgumentError(String::from( - "VariantArray has neither value nor typed_value field", - ))), + (metadata, None, None) => Ok(Self::AllNull { metadata }), } } @@ -282,6 +281,7 @@ impl ShreddingState { ShreddingState::Unshredded { metadata, .. } => metadata, ShreddingState::Typed { metadata, .. } => metadata, ShreddingState::PartiallyShredded { metadata, .. } => metadata, + ShreddingState::AllNull { metadata } => metadata, } } @@ -291,6 +291,7 @@ impl ShreddingState { ShreddingState::Unshredded { value, .. } => Some(value), ShreddingState::Typed { .. } => None, ShreddingState::PartiallyShredded { value, .. } => Some(value), + ShreddingState::AllNull { .. } => None, } } @@ -300,6 +301,7 @@ impl ShreddingState { ShreddingState::Unshredded { .. } => None, ShreddingState::Typed { typed_value, .. } => Some(typed_value), ShreddingState::PartiallyShredded { typed_value, .. } => Some(typed_value), + ShreddingState::AllNull { .. } => None, } } @@ -326,6 +328,9 @@ impl ShreddingState { value: value.slice(offset, length), typed_value: typed_value.slice(offset, length), }, + ShreddingState::AllNull { metadata } => ShreddingState::AllNull { + metadata: metadata.slice(offset, length), + }, } } } @@ -434,15 +439,17 @@ mod test { } #[test] - fn invalid_missing_value() { + fn all_null_missing_value_and_typed_value() { let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]); let array = StructArray::new(fields, vec![make_binary_view_array()], None); - // Should fail because the StructArray does not contain a 'value' field - let err = VariantArray::try_new(Arc::new(array)); - assert_eq!( - err.unwrap_err().to_string(), - "Invalid argument error: VariantArray has neither value nor typed_value field" - ); + // Should succeed and create an AllNull variant when neither value nor typed_value are present + let variant_array = VariantArray::try_new(Arc::new(array)).unwrap(); + + // Verify the shredding state is AllNull + assert!(matches!( + variant_array.shredding_state(), + ShreddingState::AllNull { .. } + )); } #[test] @@ -488,4 +495,53 @@ mod test { fn make_binary_array() -> ArrayRef { Arc::new(BinaryArray::from(vec![b"test" as &[u8]])) } + + #[test] + fn all_null_shredding_state() { + let metadata = BinaryViewArray::from(vec![b"test" as &[u8]]); + let shredding_state = ShreddingState::try_new(metadata.clone(), None, None).unwrap(); + + assert!(matches!( + shredding_state, + ShreddingState::AllNull { .. } + )); + + // Verify metadata is preserved correctly + if let ShreddingState::AllNull { metadata: m } = shredding_state { + assert_eq!(m.len(), metadata.len()); + assert_eq!(m.value(0), metadata.value(0)); + } + } + + #[test] + fn all_null_variant_array_construction() { + let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]); + let nulls = NullBuffer::from(vec![false, false, false]); // all null + + let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]); + let struct_array = StructArray::new(fields, vec![Arc::new(metadata)], Some(nulls)); + + let variant_array = VariantArray::try_new(Arc::new(struct_array)).unwrap(); + + // Verify the shredding state is AllNull + assert!(matches!( + variant_array.shredding_state(), + ShreddingState::AllNull { .. } + )); + + // Verify all values are null + assert_eq!(variant_array.len(), 3); + assert!(!variant_array.is_valid(0)); + assert!(!variant_array.is_valid(1)); + assert!(!variant_array.is_valid(2)); + + // Verify that value() returns Variant::Null for all indices + for i in 0..variant_array.len() { + assert!( + !variant_array.is_valid(i), + "Expected value at index {} to be null", + i + ); + } + } } diff --git a/parquet-variant-compute/src/variant_get/mod.rs b/parquet-variant-compute/src/variant_get/mod.rs index cc852bbc32a2..694000279551 100644 --- a/parquet-variant-compute/src/variant_get/mod.rs +++ b/parquet-variant-compute/src/variant_get/mod.rs @@ -58,6 +58,7 @@ pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result { ShreddingState::Unshredded { metadata, value } => { output_builder.unshredded(variant_array, metadata, value) } + ShreddingState::AllNull { metadata } => output_builder.all_null(variant_array, metadata), } } @@ -284,6 +285,40 @@ mod test { assert_eq!(&result, &expected) } + /// AllNull: extract a value as a VariantArray + #[test] + fn get_variant_all_null_as_variant() { + let array = all_null_variant_array(); + let options = GetOptions::new(); + let result = variant_get(&array, options).unwrap(); + + // expect the result is a VariantArray + let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + assert_eq!(result.len(), 3); + + // All values should be null + assert!(!result.is_valid(0)); + assert!(!result.is_valid(1)); + assert!(!result.is_valid(2)); + } + + /// AllNull: extract a value as an Int32Array + #[test] + fn get_variant_all_null_as_int32() { + let array = all_null_variant_array(); + // specify we want the typed value as Int32 + let field = Field::new("typed_value", DataType::Int32, true); + let options = GetOptions::new().with_as_type(Some(FieldRef::from(field))); + let result = variant_get(&array, options).unwrap(); + + let expected: ArrayRef = Arc::new(Int32Array::from(vec![ + Option::::None, + Option::::None, + Option::::None, + ])); + assert_eq!(&result, &expected) + } + /// Return a VariantArray that represents a perfectly "shredded" variant /// for the following example (3 Variant::Int32 values): /// @@ -427,4 +462,42 @@ mod test { StructArray::new(Fields::from(fields), arrays, nulls) } } + + /// Return a VariantArray that represents an "all null" variant + /// for the following example (3 null values): + /// + /// ```text + /// null + /// null + /// null + /// ``` + /// + /// The schema of the corresponding `StructArray` would look like this: + /// + /// ```text + /// StructArray { + /// metadata: BinaryViewArray, + /// } + /// ``` + fn all_null_variant_array() -> ArrayRef { + let (metadata, _value) = { parquet_variant::VariantBuilder::new().finish() }; + + let nulls = NullBuffer::from(vec![ + false, // row 0 is null + false, // row 1 is null + false, // row 2 is null + ]); + + // metadata is the same for all rows (though they're all null) + let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 3)); + + let struct_array = StructArrayBuilder::new() + .with_field("metadata", Arc::new(metadata)) + .with_nulls(nulls) + .build(); + + Arc::new( + VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"), + ) + } } diff --git a/parquet-variant-compute/src/variant_get/output/mod.rs b/parquet-variant-compute/src/variant_get/output/mod.rs index 245d73cce8db..52a8f5bc0288 100644 --- a/parquet-variant-compute/src/variant_get/output/mod.rs +++ b/parquet-variant-compute/src/variant_get/output/mod.rs @@ -58,6 +58,13 @@ pub(crate) trait OutputBuilder { metadata: &BinaryViewArray, value_field: &BinaryViewArray, ) -> Result; + + /// write out an all-null variant array + fn all_null( + &self, + variant_array: &VariantArray, + metadata: &BinaryViewArray, + ) -> Result; } pub(crate) fn instantiate_output_builder<'a>( diff --git a/parquet-variant-compute/src/variant_get/output/primitive.rs b/parquet-variant-compute/src/variant_get/output/primitive.rs index 36e4221e3242..7f737b6ecd60 100644 --- a/parquet-variant-compute/src/variant_get/output/primitive.rs +++ b/parquet-variant-compute/src/variant_get/output/primitive.rs @@ -23,6 +23,7 @@ use arrow::array::{ Array, ArrayRef, ArrowPrimitiveType, AsArray, BinaryViewArray, NullBufferBuilder, PrimitiveArray, }; +use arrow::buffer::NullBuffer; use arrow::compute::{cast_with_options, CastOptions}; use arrow::datatypes::Int32Type; use arrow_schema::{ArrowError, FieldRef}; @@ -155,6 +156,19 @@ impl<'a, T: ArrowPrimitiveVariant> OutputBuilder for PrimitiveOutputBuilder<'a, "variant_get unshredded to primitive types is not implemented yet", ))) } + + fn all_null( + &self, + variant_array: &VariantArray, + _metadata: &BinaryViewArray, + ) -> Result { + // For all-null case, create a primitive array with all null values + let nulls = NullBuffer::from(vec![false; variant_array.len()]); + let values = vec![T::default_value(); variant_array.len()]; + let array = PrimitiveArray::::new(values.into(), Some(nulls)) + .with_data_type(self.as_type.data_type().clone()); + Ok(Arc::new(array)) + } } impl ArrowPrimitiveVariant for Int32Type { diff --git a/parquet-variant-compute/src/variant_get/output/variant.rs b/parquet-variant-compute/src/variant_get/output/variant.rs index 2c04111a5306..3dcc319ecfb1 100644 --- a/parquet-variant-compute/src/variant_get/output/variant.rs +++ b/parquet-variant-compute/src/variant_get/output/variant.rs @@ -143,4 +143,17 @@ impl<'a> OutputBuilder for VariantOutputBuilder<'a> { Ok(Arc::new(builder.build())) } + + fn all_null( + &self, + variant_array: &VariantArray, + _metadata: &BinaryViewArray, + ) -> arrow::error::Result { + // For all-null case, simply create a VariantArray with all null values + let mut builder = VariantArrayBuilder::new(variant_array.len()); + for _i in 0..variant_array.len() { + builder.append_null(); + } + Ok(Arc::new(builder.build())) + } } From bd43fbf692c063cde98144fbd254bc467c16eb92 Mon Sep 17 00:00:00 2001 From: codephage2020 Date: Fri, 8 Aug 2025 16:01:02 +0800 Subject: [PATCH 2/9] fmt Signed-off-by: codephage2020 --- parquet-variant-compute/src/variant_array.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 800e6dfb9aff..09310d9c2c63 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -501,11 +501,8 @@ mod test { let metadata = BinaryViewArray::from(vec![b"test" as &[u8]]); let shredding_state = ShreddingState::try_new(metadata.clone(), None, None).unwrap(); - assert!(matches!( - shredding_state, - ShreddingState::AllNull { .. } - )); - + assert!(matches!(shredding_state, ShreddingState::AllNull { .. })); + // Verify metadata is preserved correctly if let ShreddingState::AllNull { metadata: m } = shredding_state { assert_eq!(m.len(), metadata.len()); From 1120b076f2a57df10a0031bdb4516952700a5f1d Mon Sep 17 00:00:00 2001 From: codephage2020 Date: Fri, 8 Aug 2025 15:38:23 +0800 Subject: [PATCH 3/9] Implement support for AllNull state in VariantArray and related output handling Signed-off-by: codephage2020 --- parquet-variant-compute/src/variant_array.rs | 80 ++++++++++++++++--- .../src/variant_get/mod.rs | 73 +++++++++++++++++ .../src/variant_get/output/mod.rs | 7 ++ .../src/variant_get/output/primitive.rs | 14 ++++ .../src/variant_get/output/variant.rs | 13 +++ 5 files changed, 175 insertions(+), 12 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index d51df550622d..800e6dfb9aff 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -187,6 +187,7 @@ impl VariantArray { typed_value_to_variant(typed_value, index) } } + ShreddingState::AllNull { .. } => Variant::Null, } } @@ -226,8 +227,6 @@ impl VariantArray { /// [Parquet Variant Shredding Spec]: https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding #[derive(Debug)] pub enum ShreddingState { - // TODO: add missing state where there is neither value nor typed_value - // Missing { metadata: BinaryViewArray }, /// This variant has no typed_value field Unshredded { metadata: BinaryViewArray, @@ -250,6 +249,8 @@ pub enum ShreddingState { value: BinaryViewArray, typed_value: ArrayRef, }, + /// All values are null, only metadata is present + AllNull { metadata: BinaryViewArray }, } impl ShreddingState { @@ -270,9 +271,7 @@ impl ShreddingState { metadata, typed_value, }), - (_metadata_field, None, None) => Err(ArrowError::InvalidArgumentError(String::from( - "VariantArray has neither value nor typed_value field", - ))), + (metadata, None, None) => Ok(Self::AllNull { metadata }), } } @@ -282,6 +281,7 @@ impl ShreddingState { ShreddingState::Unshredded { metadata, .. } => metadata, ShreddingState::Typed { metadata, .. } => metadata, ShreddingState::PartiallyShredded { metadata, .. } => metadata, + ShreddingState::AllNull { metadata } => metadata, } } @@ -291,6 +291,7 @@ impl ShreddingState { ShreddingState::Unshredded { value, .. } => Some(value), ShreddingState::Typed { .. } => None, ShreddingState::PartiallyShredded { value, .. } => Some(value), + ShreddingState::AllNull { .. } => None, } } @@ -300,6 +301,7 @@ impl ShreddingState { ShreddingState::Unshredded { .. } => None, ShreddingState::Typed { typed_value, .. } => Some(typed_value), ShreddingState::PartiallyShredded { typed_value, .. } => Some(typed_value), + ShreddingState::AllNull { .. } => None, } } @@ -326,6 +328,9 @@ impl ShreddingState { value: value.slice(offset, length), typed_value: typed_value.slice(offset, length), }, + ShreddingState::AllNull { metadata } => ShreddingState::AllNull { + metadata: metadata.slice(offset, length), + }, } } } @@ -434,15 +439,17 @@ mod test { } #[test] - fn invalid_missing_value() { + fn all_null_missing_value_and_typed_value() { let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]); let array = StructArray::new(fields, vec![make_binary_view_array()], None); - // Should fail because the StructArray does not contain a 'value' field - let err = VariantArray::try_new(Arc::new(array)); - assert_eq!( - err.unwrap_err().to_string(), - "Invalid argument error: VariantArray has neither value nor typed_value field" - ); + // Should succeed and create an AllNull variant when neither value nor typed_value are present + let variant_array = VariantArray::try_new(Arc::new(array)).unwrap(); + + // Verify the shredding state is AllNull + assert!(matches!( + variant_array.shredding_state(), + ShreddingState::AllNull { .. } + )); } #[test] @@ -488,4 +495,53 @@ mod test { fn make_binary_array() -> ArrayRef { Arc::new(BinaryArray::from(vec![b"test" as &[u8]])) } + + #[test] + fn all_null_shredding_state() { + let metadata = BinaryViewArray::from(vec![b"test" as &[u8]]); + let shredding_state = ShreddingState::try_new(metadata.clone(), None, None).unwrap(); + + assert!(matches!( + shredding_state, + ShreddingState::AllNull { .. } + )); + + // Verify metadata is preserved correctly + if let ShreddingState::AllNull { metadata: m } = shredding_state { + assert_eq!(m.len(), metadata.len()); + assert_eq!(m.value(0), metadata.value(0)); + } + } + + #[test] + fn all_null_variant_array_construction() { + let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]); + let nulls = NullBuffer::from(vec![false, false, false]); // all null + + let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]); + let struct_array = StructArray::new(fields, vec![Arc::new(metadata)], Some(nulls)); + + let variant_array = VariantArray::try_new(Arc::new(struct_array)).unwrap(); + + // Verify the shredding state is AllNull + assert!(matches!( + variant_array.shredding_state(), + ShreddingState::AllNull { .. } + )); + + // Verify all values are null + assert_eq!(variant_array.len(), 3); + assert!(!variant_array.is_valid(0)); + assert!(!variant_array.is_valid(1)); + assert!(!variant_array.is_valid(2)); + + // Verify that value() returns Variant::Null for all indices + for i in 0..variant_array.len() { + assert!( + !variant_array.is_valid(i), + "Expected value at index {} to be null", + i + ); + } + } } diff --git a/parquet-variant-compute/src/variant_get/mod.rs b/parquet-variant-compute/src/variant_get/mod.rs index cc852bbc32a2..694000279551 100644 --- a/parquet-variant-compute/src/variant_get/mod.rs +++ b/parquet-variant-compute/src/variant_get/mod.rs @@ -58,6 +58,7 @@ pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result { ShreddingState::Unshredded { metadata, value } => { output_builder.unshredded(variant_array, metadata, value) } + ShreddingState::AllNull { metadata } => output_builder.all_null(variant_array, metadata), } } @@ -284,6 +285,40 @@ mod test { assert_eq!(&result, &expected) } + /// AllNull: extract a value as a VariantArray + #[test] + fn get_variant_all_null_as_variant() { + let array = all_null_variant_array(); + let options = GetOptions::new(); + let result = variant_get(&array, options).unwrap(); + + // expect the result is a VariantArray + let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + assert_eq!(result.len(), 3); + + // All values should be null + assert!(!result.is_valid(0)); + assert!(!result.is_valid(1)); + assert!(!result.is_valid(2)); + } + + /// AllNull: extract a value as an Int32Array + #[test] + fn get_variant_all_null_as_int32() { + let array = all_null_variant_array(); + // specify we want the typed value as Int32 + let field = Field::new("typed_value", DataType::Int32, true); + let options = GetOptions::new().with_as_type(Some(FieldRef::from(field))); + let result = variant_get(&array, options).unwrap(); + + let expected: ArrayRef = Arc::new(Int32Array::from(vec![ + Option::::None, + Option::::None, + Option::::None, + ])); + assert_eq!(&result, &expected) + } + /// Return a VariantArray that represents a perfectly "shredded" variant /// for the following example (3 Variant::Int32 values): /// @@ -427,4 +462,42 @@ mod test { StructArray::new(Fields::from(fields), arrays, nulls) } } + + /// Return a VariantArray that represents an "all null" variant + /// for the following example (3 null values): + /// + /// ```text + /// null + /// null + /// null + /// ``` + /// + /// The schema of the corresponding `StructArray` would look like this: + /// + /// ```text + /// StructArray { + /// metadata: BinaryViewArray, + /// } + /// ``` + fn all_null_variant_array() -> ArrayRef { + let (metadata, _value) = { parquet_variant::VariantBuilder::new().finish() }; + + let nulls = NullBuffer::from(vec![ + false, // row 0 is null + false, // row 1 is null + false, // row 2 is null + ]); + + // metadata is the same for all rows (though they're all null) + let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 3)); + + let struct_array = StructArrayBuilder::new() + .with_field("metadata", Arc::new(metadata)) + .with_nulls(nulls) + .build(); + + Arc::new( + VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"), + ) + } } diff --git a/parquet-variant-compute/src/variant_get/output/mod.rs b/parquet-variant-compute/src/variant_get/output/mod.rs index 245d73cce8db..52a8f5bc0288 100644 --- a/parquet-variant-compute/src/variant_get/output/mod.rs +++ b/parquet-variant-compute/src/variant_get/output/mod.rs @@ -58,6 +58,13 @@ pub(crate) trait OutputBuilder { metadata: &BinaryViewArray, value_field: &BinaryViewArray, ) -> Result; + + /// write out an all-null variant array + fn all_null( + &self, + variant_array: &VariantArray, + metadata: &BinaryViewArray, + ) -> Result; } pub(crate) fn instantiate_output_builder<'a>( diff --git a/parquet-variant-compute/src/variant_get/output/primitive.rs b/parquet-variant-compute/src/variant_get/output/primitive.rs index 36e4221e3242..7f737b6ecd60 100644 --- a/parquet-variant-compute/src/variant_get/output/primitive.rs +++ b/parquet-variant-compute/src/variant_get/output/primitive.rs @@ -23,6 +23,7 @@ use arrow::array::{ Array, ArrayRef, ArrowPrimitiveType, AsArray, BinaryViewArray, NullBufferBuilder, PrimitiveArray, }; +use arrow::buffer::NullBuffer; use arrow::compute::{cast_with_options, CastOptions}; use arrow::datatypes::Int32Type; use arrow_schema::{ArrowError, FieldRef}; @@ -155,6 +156,19 @@ impl<'a, T: ArrowPrimitiveVariant> OutputBuilder for PrimitiveOutputBuilder<'a, "variant_get unshredded to primitive types is not implemented yet", ))) } + + fn all_null( + &self, + variant_array: &VariantArray, + _metadata: &BinaryViewArray, + ) -> Result { + // For all-null case, create a primitive array with all null values + let nulls = NullBuffer::from(vec![false; variant_array.len()]); + let values = vec![T::default_value(); variant_array.len()]; + let array = PrimitiveArray::::new(values.into(), Some(nulls)) + .with_data_type(self.as_type.data_type().clone()); + Ok(Arc::new(array)) + } } impl ArrowPrimitiveVariant for Int32Type { diff --git a/parquet-variant-compute/src/variant_get/output/variant.rs b/parquet-variant-compute/src/variant_get/output/variant.rs index 2c04111a5306..3dcc319ecfb1 100644 --- a/parquet-variant-compute/src/variant_get/output/variant.rs +++ b/parquet-variant-compute/src/variant_get/output/variant.rs @@ -143,4 +143,17 @@ impl<'a> OutputBuilder for VariantOutputBuilder<'a> { Ok(Arc::new(builder.build())) } + + fn all_null( + &self, + variant_array: &VariantArray, + _metadata: &BinaryViewArray, + ) -> arrow::error::Result { + // For all-null case, simply create a VariantArray with all null values + let mut builder = VariantArrayBuilder::new(variant_array.len()); + for _i in 0..variant_array.len() { + builder.append_null(); + } + Ok(Arc::new(builder.build())) + } } From b5d4c5d4c1bb83425edad65934502fb6fbaefe90 Mon Sep 17 00:00:00 2001 From: codephage2020 Date: Fri, 8 Aug 2025 16:01:02 +0800 Subject: [PATCH 4/9] fmt Signed-off-by: codephage2020 --- parquet-variant-compute/src/variant_array.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 800e6dfb9aff..09310d9c2c63 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -501,11 +501,8 @@ mod test { let metadata = BinaryViewArray::from(vec![b"test" as &[u8]]); let shredding_state = ShreddingState::try_new(metadata.clone(), None, None).unwrap(); - assert!(matches!( - shredding_state, - ShreddingState::AllNull { .. } - )); - + assert!(matches!(shredding_state, ShreddingState::AllNull { .. })); + // Verify metadata is preserved correctly if let ShreddingState::AllNull { metadata: m } = shredding_state { assert_eq!(m.len(), metadata.len()); From b237cb12b006375d24e7f1f1fbf7f75d679b56a0 Mon Sep 17 00:00:00 2001 From: codephage2020 Date: Tue, 12 Aug 2025 22:34:43 +0800 Subject: [PATCH 5/9] Add test for unshredded state when value field is present but all null Signed-off-by: codephage2020 --- parquet-variant-compute/src/variant_array.rs | 39 +++++++++++++++++++- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 09310d9c2c63..e0f4de5dbccf 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -536,9 +536,44 @@ mod test { for i in 0..variant_array.len() { assert!( !variant_array.is_valid(i), - "Expected value at index {} to be null", - i + "Expected value at index {i} to be null" ); } } + + #[test] + fn value_field_present_but_all_null_should_be_unshredded() { + // This test demonstrates the issue: when a value field exists in schema + // but all its values are null, it should remain Unshredded, not AllNull + let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]); + + // Create a value field with all null values + let value_nulls = NullBuffer::from(vec![false, false, false]); // all null + let value_array = BinaryViewArray::from_iter_values(vec![""; 3]); + let value_data = value_array + .to_data() + .into_builder() + .nulls(Some(value_nulls)) + .build() + .unwrap(); + let value = BinaryViewArray::from(value_data); + + let fields = Fields::from(vec![ + Field::new("metadata", DataType::BinaryView, false), + Field::new("value", DataType::BinaryView, true), // Field exists in schema + ]); + let struct_array = StructArray::new( + fields, + vec![Arc::new(metadata), Arc::new(value)], + None, // struct itself is not null, just the value field is all null + ); + + let variant_array = VariantArray::try_new(Arc::new(struct_array)).unwrap(); + + // This should be Unshredded, not AllNull, because value field exists in schema + assert!(matches!( + variant_array.shredding_state(), + ShreddingState::Unshredded { .. } + )); + } } From d266031f1a8047578858f3892e91b9e6c1c68456 Mon Sep 17 00:00:00 2001 From: codephage2020 Date: Tue, 12 Aug 2025 22:45:46 +0800 Subject: [PATCH 6/9] Delete duplicates Signed-off-by: codephage2020 --- parquet-variant-compute/src/variant_array.rs | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index a03d8d66c77b..e0f4de5dbccf 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -188,7 +188,6 @@ impl VariantArray { } } ShreddingState::AllNull { .. } => Variant::Null, - ShreddingState::AllNull { .. } => Variant::Null, } } @@ -252,8 +251,6 @@ pub enum ShreddingState { }, /// All values are null, only metadata is present AllNull { metadata: BinaryViewArray }, - /// All values are null, only metadata is present - AllNull { metadata: BinaryViewArray }, } impl ShreddingState { @@ -275,7 +272,6 @@ impl ShreddingState { typed_value, }), (metadata, None, None) => Ok(Self::AllNull { metadata }), - (metadata, None, None) => Ok(Self::AllNull { metadata }), } } @@ -286,7 +282,6 @@ impl ShreddingState { ShreddingState::Typed { metadata, .. } => metadata, ShreddingState::PartiallyShredded { metadata, .. } => metadata, ShreddingState::AllNull { metadata } => metadata, - ShreddingState::AllNull { metadata } => metadata, } } @@ -297,7 +292,6 @@ impl ShreddingState { ShreddingState::Typed { .. } => None, ShreddingState::PartiallyShredded { value, .. } => Some(value), ShreddingState::AllNull { .. } => None, - ShreddingState::AllNull { .. } => None, } } @@ -308,7 +302,6 @@ impl ShreddingState { ShreddingState::Typed { typed_value, .. } => Some(typed_value), ShreddingState::PartiallyShredded { typed_value, .. } => Some(typed_value), ShreddingState::AllNull { .. } => None, - ShreddingState::AllNull { .. } => None, } } @@ -338,9 +331,6 @@ impl ShreddingState { ShreddingState::AllNull { metadata } => ShreddingState::AllNull { metadata: metadata.slice(offset, length), }, - ShreddingState::AllNull { metadata } => ShreddingState::AllNull { - metadata: metadata.slice(offset, length), - }, } } } @@ -449,21 +439,12 @@ mod test { } #[test] - fn all_null_missing_value_and_typed_value() { fn all_null_missing_value_and_typed_value() { let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]); let array = StructArray::new(fields, vec![make_binary_view_array()], None); // Should succeed and create an AllNull variant when neither value nor typed_value are present let variant_array = VariantArray::try_new(Arc::new(array)).unwrap(); - // Verify the shredding state is AllNull - assert!(matches!( - variant_array.shredding_state(), - ShreddingState::AllNull { .. } - )); - // Should succeed and create an AllNull variant when neither value nor typed_value are present - let variant_array = VariantArray::try_new(Arc::new(array)).unwrap(); - // Verify the shredding state is AllNull assert!(matches!( variant_array.shredding_state(), From 4b52a03f578ff6de083349c89d7391726951347a Mon Sep 17 00:00:00 2001 From: Yan Tingwang Date: Wed, 13 Aug 2025 15:19:05 +0800 Subject: [PATCH 7/9] Update parquet-variant-compute/src/variant_get/output/primitive.rs Co-authored-by: Ryan Johnson Co-authored-by: Ryan Johnson --- parquet-variant-compute/src/variant_get/output/primitive.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/parquet-variant-compute/src/variant_get/output/primitive.rs b/parquet-variant-compute/src/variant_get/output/primitive.rs index 7f737b6ecd60..078120aef0ba 100644 --- a/parquet-variant-compute/src/variant_get/output/primitive.rs +++ b/parquet-variant-compute/src/variant_get/output/primitive.rs @@ -163,11 +163,7 @@ impl<'a, T: ArrowPrimitiveVariant> OutputBuilder for PrimitiveOutputBuilder<'a, _metadata: &BinaryViewArray, ) -> Result { // For all-null case, create a primitive array with all null values - let nulls = NullBuffer::from(vec![false; variant_array.len()]); - let values = vec![T::default_value(); variant_array.len()]; - let array = PrimitiveArray::::new(values.into(), Some(nulls)) - .with_data_type(self.as_type.data_type().clone()); - Ok(Arc::new(array)) + Ok(Arc::new(new_null_array(self.as_type.data_type(), variant_array.len()))) } } From b16c95ba28c767690cb14258fdfe715ab71f9984 Mon Sep 17 00:00:00 2001 From: codephage2020 Date: Wed, 13 Aug 2025 15:42:03 +0800 Subject: [PATCH 8/9] fmt and clippy Signed-off-by: codephage2020 --- .../src/variant_get/output/primitive.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/parquet-variant-compute/src/variant_get/output/primitive.rs b/parquet-variant-compute/src/variant_get/output/primitive.rs index 078120aef0ba..2adb89485d53 100644 --- a/parquet-variant-compute/src/variant_get/output/primitive.rs +++ b/parquet-variant-compute/src/variant_get/output/primitive.rs @@ -20,10 +20,9 @@ use crate::VariantArray; use arrow::error::Result; use arrow::array::{ - Array, ArrayRef, ArrowPrimitiveType, AsArray, BinaryViewArray, NullBufferBuilder, - PrimitiveArray, + new_null_array, Array, ArrayRef, ArrowPrimitiveType, AsArray, BinaryViewArray, + NullBufferBuilder, PrimitiveArray, }; -use arrow::buffer::NullBuffer; use arrow::compute::{cast_with_options, CastOptions}; use arrow::datatypes::Int32Type; use arrow_schema::{ArrowError, FieldRef}; @@ -163,7 +162,10 @@ impl<'a, T: ArrowPrimitiveVariant> OutputBuilder for PrimitiveOutputBuilder<'a, _metadata: &BinaryViewArray, ) -> Result { // For all-null case, create a primitive array with all null values - Ok(Arc::new(new_null_array(self.as_type.data_type(), variant_array.len()))) + Ok(Arc::new(new_null_array( + self.as_type.data_type(), + variant_array.len(), + ))) } } From f790be71c5abebcbbadaab4af973e5f5653b0d91 Mon Sep 17 00:00:00 2001 From: codephage2020 Date: Tue, 19 Aug 2025 13:36:37 +0800 Subject: [PATCH 9/9] Updated comments Signed-off-by: codephage2020 --- parquet-variant-compute/src/variant_array.rs | 27 +++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index e0f4de5dbccf..9f07480f5588 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -187,7 +187,13 @@ impl VariantArray { typed_value_to_variant(typed_value, index) } } - ShreddingState::AllNull { .. } => Variant::Null, + ShreddingState::AllNull { .. } => { + // NOTE: This handles the case where neither value nor typed_value fields exist. + // For top-level variants, this returns Variant::Null (JSON null). + // For shredded object fields, this technically should indicate SQL NULL, + // but the current API cannot distinguish these contexts. + Variant::Null + } } } @@ -249,7 +255,12 @@ pub enum ShreddingState { value: BinaryViewArray, typed_value: ArrayRef, }, - /// All values are null, only metadata is present + /// All values are null, only metadata is present. + /// + /// This state occurs when neither `value` nor `typed_value` fields exist in the schema. + /// Note: By strict spec interpretation, this should only be valid for shredded object fields, + /// not top-level variants. However, we allow it and treat as Variant::Null for pragmatic + /// handling of missing data. AllNull { metadata: BinaryViewArray }, } @@ -442,7 +453,10 @@ mod test { fn all_null_missing_value_and_typed_value() { let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]); let array = StructArray::new(fields, vec![make_binary_view_array()], None); - // Should succeed and create an AllNull variant when neither value nor typed_value are present + + // NOTE: By strict spec interpretation, this case (top-level variant with null/null) + // should be invalid, but we currently allow it and treat it as Variant::Null. + // This is a pragmatic decision to handle missing data gracefully. let variant_array = VariantArray::try_new(Arc::new(array)).unwrap(); // Verify the shredding state is AllNull @@ -450,6 +464,13 @@ mod test { variant_array.shredding_state(), ShreddingState::AllNull { .. } )); + + // Verify that value() returns Variant::Null (compensating for spec violation) + for i in 0..variant_array.len() { + if variant_array.is_valid(i) { + assert_eq!(variant_array.value(i), parquet_variant::Variant::Null); + } + } } #[test]