Skip to content
Merged
134 changes: 121 additions & 13 deletions parquet-variant-compute/src/variant_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,13 @@ impl VariantArray {
typed_value_to_variant(typed_value, index)
}
}
ShreddingState::AllNull { .. } => {
// NOTE: This handles the case where neither value nor typed_value fields exist.
// For top-level variants, this returns Variant::Null (JSON null).
// For shredded object fields, this technically should indicate SQL NULL,
// but the current API cannot distinguish these contexts.
Variant::Null
}
}
}

Expand Down Expand Up @@ -226,9 +233,6 @@ impl VariantArray {
/// [Parquet Variant Shredding Spec]: https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding
#[derive(Debug)]
pub enum ShreddingState {
// TODO: add missing state where there is neither value nor typed_value
// https://github.com/apache/arrow-rs/issues/8088
// Missing { metadata: BinaryViewArray },
/// This variant has no typed_value field
Unshredded {
metadata: BinaryViewArray,
Expand All @@ -251,6 +255,13 @@ pub enum ShreddingState {
value: BinaryViewArray,
typed_value: ArrayRef,
},
/// All values are null, only metadata is present.
///
/// This state occurs when neither `value` nor `typed_value` fields exist in the schema.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is a reasonable approach for now. If we find a reason to restrict AllNull to only sub fields at some later point, then we can add additional code / restrictions

/// Note: By strict spec interpretation, this should only be valid for shredded object fields,
/// not top-level variants. However, we allow it and treat as Variant::Null for pragmatic
/// handling of missing data.
AllNull { metadata: BinaryViewArray },
}

impl ShreddingState {
Expand All @@ -271,9 +282,7 @@ impl ShreddingState {
metadata,
typed_value,
}),
(_metadata_field, None, None) => Err(ArrowError::InvalidArgumentError(String::from(
"VariantArray has neither value nor typed_value field",
))),
(metadata, None, None) => Ok(Self::AllNull { metadata }),
}
}

Expand All @@ -283,6 +292,7 @@ impl ShreddingState {
ShreddingState::Unshredded { metadata, .. } => metadata,
ShreddingState::Typed { metadata, .. } => metadata,
ShreddingState::PartiallyShredded { metadata, .. } => metadata,
ShreddingState::AllNull { metadata } => metadata,
}
}

Expand All @@ -292,6 +302,7 @@ impl ShreddingState {
ShreddingState::Unshredded { value, .. } => Some(value),
ShreddingState::Typed { .. } => None,
ShreddingState::PartiallyShredded { value, .. } => Some(value),
ShreddingState::AllNull { .. } => None,
}
}

Expand All @@ -301,6 +312,7 @@ impl ShreddingState {
ShreddingState::Unshredded { .. } => None,
ShreddingState::Typed { typed_value, .. } => Some(typed_value),
ShreddingState::PartiallyShredded { typed_value, .. } => Some(typed_value),
ShreddingState::AllNull { .. } => None,
}
}

Expand All @@ -327,6 +339,9 @@ impl ShreddingState {
value: value.slice(offset, length),
typed_value: typed_value.slice(offset, length),
},
ShreddingState::AllNull { metadata } => ShreddingState::AllNull {
metadata: metadata.slice(offset, length),
},
}
}
}
Expand Down Expand Up @@ -435,15 +450,27 @@ mod test {
}

#[test]
fn invalid_missing_value() {
fn all_null_missing_value_and_typed_value() {
let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]);
let array = StructArray::new(fields, vec![make_binary_view_array()], None);
// Should fail because the StructArray does not contain a 'value' field
let err = VariantArray::try_new(Arc::new(array));
assert_eq!(
err.unwrap_err().to_string(),
"Invalid argument error: VariantArray has neither value nor typed_value field"
);

// NOTE: By strict spec interpretation, this case (top-level variant with null/null)
// should be invalid, but we currently allow it and treat it as Variant::Null.
// This is a pragmatic decision to handle missing data gracefully.
let variant_array = VariantArray::try_new(Arc::new(array)).unwrap();

// Verify the shredding state is AllNull
assert!(matches!(
variant_array.shredding_state(),
ShreddingState::AllNull { .. }
));

// Verify that value() returns Variant::Null (compensating for spec violation)
for i in 0..variant_array.len() {
if variant_array.is_valid(i) {
assert_eq!(variant_array.value(i), parquet_variant::Variant::Null);
}
}
}

#[test]
Expand Down Expand Up @@ -489,4 +516,85 @@ mod test {
fn make_binary_array() -> ArrayRef {
Arc::new(BinaryArray::from(vec![b"test" as &[u8]]))
}

#[test]
fn all_null_shredding_state() {
let metadata = BinaryViewArray::from(vec![b"test" as &[u8]]);
let shredding_state = ShreddingState::try_new(metadata.clone(), None, None).unwrap();

assert!(matches!(shredding_state, ShreddingState::AllNull { .. }));

// Verify metadata is preserved correctly
if let ShreddingState::AllNull { metadata: m } = shredding_state {
assert_eq!(m.len(), metadata.len());
assert_eq!(m.value(0), metadata.value(0));
}
}

#[test]
fn all_null_variant_array_construction() {
let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]);
let nulls = NullBuffer::from(vec![false, false, false]); // all null
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we sure that this case (where there is a value field present, but it is all null) should be treated as though there was no value field?

I haven't double checked the spec (probably the Arrow one) but this feels like it may be out of compliance

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As far as I know, the spec's requirement that "both value and typed_value are null" meaning "the value is missing."
We can see from the spec's example:
refer to here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I got it. I will add a test for the case.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a comprehensive test that demonstrates when a value field exists in the schema but contains all null values, it correctly remains in the Unshredded state rather than AllNull.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From what I understand, the treatment of NULL/NULL depends on context:

  • For a top-level variant value, it's interpreted as Variant::Null
  • For a shredded variant object field, it's interpreted as missing (SQL NULL)

So I guess there are two ways to get SQL NULL -- null mask on the struct(value, typed_value), or if both value and typed_value are themselves NULL?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah -- the spec requires that the struct ("group") containing value and typed_value columns must be non-nullable:

The group for each named field must use repetition level required. A field's value and typed_value are set to null (missing) to indicate that the field does not exist in the variant. To encode a field that is present with a [variant/JSON, not SQL] null value, the value must contain a Variant null: basic type 0 (primitive) and physical type 0 (null).

So effectively, the NULL/NULL combo becomes the null mask for that nested field. Which is why a top-level NULL/NULL combo is incorrect -- the top-level field already has its own nullability.


let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]);
let struct_array = StructArray::new(fields, vec![Arc::new(metadata)], Some(nulls));

let variant_array = VariantArray::try_new(Arc::new(struct_array)).unwrap();

// Verify the shredding state is AllNull
assert!(matches!(
variant_array.shredding_state(),
ShreddingState::AllNull { .. }
));

// Verify all values are null
assert_eq!(variant_array.len(), 3);
assert!(!variant_array.is_valid(0));
assert!(!variant_array.is_valid(1));
assert!(!variant_array.is_valid(2));

// Verify that value() returns Variant::Null for all indices
for i in 0..variant_array.len() {
assert!(
!variant_array.is_valid(i),
"Expected value at index {i} to be null"
);
}
}

#[test]
fn value_field_present_but_all_null_should_be_unshredded() {
// This test demonstrates the issue: when a value field exists in schema
// but all its values are null, it should remain Unshredded, not AllNull
let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]);

// Create a value field with all null values
let value_nulls = NullBuffer::from(vec![false, false, false]); // all null
let value_array = BinaryViewArray::from_iter_values(vec![""; 3]);
let value_data = value_array
.to_data()
.into_builder()
.nulls(Some(value_nulls))
.build()
.unwrap();
let value = BinaryViewArray::from(value_data);

let fields = Fields::from(vec![
Field::new("metadata", DataType::BinaryView, false),
Field::new("value", DataType::BinaryView, true), // Field exists in schema
]);
let struct_array = StructArray::new(
fields,
vec![Arc::new(metadata), Arc::new(value)],
None, // struct itself is not null, just the value field is all null
);

let variant_array = VariantArray::try_new(Arc::new(struct_array)).unwrap();

// This should be Unshredded, not AllNull, because value field exists in schema
assert!(matches!(
variant_array.shredding_state(),
ShreddingState::Unshredded { .. }
));
}
}
73 changes: 73 additions & 0 deletions parquet-variant-compute/src/variant_get/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result<ArrayRef> {
ShreddingState::Unshredded { metadata, value } => {
output_builder.unshredded(variant_array, metadata, value)
}
ShreddingState::AllNull { metadata } => output_builder.all_null(variant_array, metadata),
}
}

Expand Down Expand Up @@ -284,6 +285,40 @@ mod test {
assert_eq!(&result, &expected)
}

/// AllNull: extract a value as a VariantArray
#[test]
fn get_variant_all_null_as_variant() {
let array = all_null_variant_array();
let options = GetOptions::new();
let result = variant_get(&array, options).unwrap();

// expect the result is a VariantArray
let result: &VariantArray = result.as_any().downcast_ref().unwrap();
assert_eq!(result.len(), 3);

// All values should be null
assert!(!result.is_valid(0));
assert!(!result.is_valid(1));
assert!(!result.is_valid(2));
}

/// AllNull: extract a value as an Int32Array
#[test]
fn get_variant_all_null_as_int32() {
let array = all_null_variant_array();
// specify we want the typed value as Int32
let field = Field::new("typed_value", DataType::Int32, true);
let options = GetOptions::new().with_as_type(Some(FieldRef::from(field)));
let result = variant_get(&array, options).unwrap();

let expected: ArrayRef = Arc::new(Int32Array::from(vec![
Option::<i32>::None,
Option::<i32>::None,
Option::<i32>::None,
]));
assert_eq!(&result, &expected)
}

/// Return a VariantArray that represents a perfectly "shredded" variant
/// for the following example (3 Variant::Int32 values):
///
Expand Down Expand Up @@ -427,4 +462,42 @@ mod test {
StructArray::new(Fields::from(fields), arrays, nulls)
}
}

/// Return a VariantArray that represents an "all null" variant
/// for the following example (3 null values):
///
/// ```text
/// null
/// null
/// null
/// ```
///
/// The schema of the corresponding `StructArray` would look like this:
///
/// ```text
/// StructArray {
/// metadata: BinaryViewArray,
/// }
/// ```
fn all_null_variant_array() -> ArrayRef {
let (metadata, _value) = { parquet_variant::VariantBuilder::new().finish() };

let nulls = NullBuffer::from(vec![
false, // row 0 is null
false, // row 1 is null
false, // row 2 is null
]);

// metadata is the same for all rows (though they're all null)
let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 3));

let struct_array = StructArrayBuilder::new()
.with_field("metadata", Arc::new(metadata))
.with_nulls(nulls)
.build();

Arc::new(
VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"),
)
}
}
7 changes: 7 additions & 0 deletions parquet-variant-compute/src/variant_get/output/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,13 @@ pub(crate) trait OutputBuilder {
metadata: &BinaryViewArray,
value_field: &BinaryViewArray,
) -> Result<ArrayRef>;

/// write out an all-null variant array
fn all_null(
&self,
variant_array: &VariantArray,
metadata: &BinaryViewArray,
) -> Result<ArrayRef>;
}

pub(crate) fn instantiate_output_builder<'a>(
Expand Down
16 changes: 14 additions & 2 deletions parquet-variant-compute/src/variant_get/output/primitive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ use crate::VariantArray;
use arrow::error::Result;

use arrow::array::{
Array, ArrayRef, ArrowPrimitiveType, AsArray, BinaryViewArray, NullBufferBuilder,
PrimitiveArray,
new_null_array, Array, ArrayRef, ArrowPrimitiveType, AsArray, BinaryViewArray,
NullBufferBuilder, PrimitiveArray,
};
use arrow::compute::{cast_with_options, CastOptions};
use arrow::datatypes::Int32Type;
Expand Down Expand Up @@ -157,6 +157,18 @@ impl<T: ArrowPrimitiveVariant> OutputBuilder for PrimitiveOutputBuilder<'_, T> {
"variant_get unshredded to primitive types is not implemented yet",
)))
}

fn all_null(
&self,
variant_array: &VariantArray,
_metadata: &BinaryViewArray,
) -> Result<ArrayRef> {
// For all-null case, create a primitive array with all null values
Ok(Arc::new(new_null_array(
self.as_type.data_type(),
variant_array.len(),
)))
}
}

impl ArrowPrimitiveVariant for Int32Type {
Expand Down
13 changes: 13 additions & 0 deletions parquet-variant-compute/src/variant_get/output/variant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -145,4 +145,17 @@ impl OutputBuilder for VariantOutputBuilder<'_> {

Ok(Arc::new(builder.build()))
}

fn all_null(
&self,
variant_array: &VariantArray,
_metadata: &BinaryViewArray,
) -> arrow::error::Result<ArrayRef> {
// For all-null case, simply create a VariantArray with all null values
let mut builder = VariantArrayBuilder::new(variant_array.len());
for _i in 0..variant_array.len() {
builder.append_null();
}
Ok(Arc::new(builder.build()))
}
}
Loading