From 73b83daaae245257a0c0aa765128b71596c774da Mon Sep 17 00:00:00 2001 From: Liam Bao Date: Tue, 19 Aug 2025 21:16:33 -0400 Subject: [PATCH 1/3] [Variant]: Implement `DataType::RunEndEncoded` support for `cast_to_variant` kernel --- .../src/cast_to_variant.rs | 108 +++++++++++++++++- 1 file changed, 105 insertions(+), 3 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 926a4d4efc97..f44423987a4f 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -502,6 +502,56 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { builder ); } + DataType::RunEndEncoded(run_ends, _) => match run_ends.data_type() { + DataType::Int16 => { + let run_array = input.as_run::(); + let values_variant_array = cast_to_variant(run_array.values().as_ref())?; + + for i in 0..run_array.len() { + let physical_idx = run_array.get_physical_index(i); + if values_variant_array.is_null(physical_idx) { + builder.append_null(); + } else { + let value = values_variant_array.value(physical_idx); + builder.append_variant(value); + } + } + } + DataType::Int32 => { + let run_array = input.as_run::(); + let values_variant_array = cast_to_variant(run_array.values().as_ref())?; + + for i in 0..run_array.len() { + let physical_idx = run_array.get_physical_index(i); + if values_variant_array.is_null(physical_idx) { + builder.append_null(); + } else { + let value = values_variant_array.value(physical_idx); + builder.append_variant(value); + } + } + } + DataType::Int64 => { + let run_array = input.as_run::(); + let values_variant_array = cast_to_variant(run_array.values().as_ref())?; + + for i in 0..run_array.len() { + let physical_idx = run_array.get_physical_index(i); + if values_variant_array.is_null(physical_idx) { + builder.append_null(); + } else { + let value = values_variant_array.value(physical_idx); + builder.append_variant(value); + } + } + } + _ => { + return Err(ArrowError::CastError(format!( + "Unsupported run ends type: {:?}", + run_ends.data_type() + ))); + } + }, dt => { return Err(ArrowError::CastError(format!( "Unsupported data type for casting to Variant: {dt:?}", @@ -523,9 +573,9 @@ mod tests { Decimal256Array, Decimal32Array, Decimal64Array, FixedSizeBinaryBuilder, Float16Array, Float32Array, Float64Array, GenericByteBuilder, GenericByteViewBuilder, Int16Array, Int32Array, Int64Array, Int8Array, IntervalYearMonthArray, LargeStringArray, NullArray, - StringArray, StringViewArray, StructArray, Time32MillisecondArray, Time32SecondArray, - Time64MicrosecondArray, Time64NanosecondArray, UInt16Array, UInt32Array, UInt64Array, - UInt8Array, + StringArray, StringRunBuilder, StringViewArray, StructArray, Time32MillisecondArray, + Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, UInt16Array, UInt32Array, + UInt64Array, UInt8Array, }; use arrow::buffer::NullBuffer; use arrow_schema::{Field, Fields}; @@ -1820,6 +1870,58 @@ mod tests { ); } + #[test] + fn test_cast_to_variant_run_end_encoded() { + let mut builder = StringRunBuilder::::new(); + builder.append_value("apple"); + builder.append_value("apple"); + builder.append_value("banana"); + builder.append_value("banana"); + builder.append_value("banana"); + builder.append_value("cherry"); + let run_array = builder.finish(); + + run_test( + Arc::new(run_array), + vec![ + Some(Variant::from("apple")), + Some(Variant::from("apple")), + Some(Variant::from("banana")), + Some(Variant::from("banana")), + Some(Variant::from("banana")), + Some(Variant::from("cherry")), + ], + ); + } + + #[test] + fn test_cast_to_variant_run_end_encoded_with_nulls() { + use arrow::array::StringRunBuilder; + use arrow::datatypes::Int32Type; + + // Test run-end encoded array with nulls + let mut builder = StringRunBuilder::::new(); + builder.append_value("apple"); + builder.append_null(); + builder.append_value("banana"); + builder.append_value("banana"); + builder.append_null(); + builder.append_null(); + let run_array = builder.finish(); + + run_test( + Arc::new(run_array), + vec![ + Some(Variant::from("apple")), + None, + Some(Variant::from("banana")), + Some(Variant::from("banana")), + None, + None, + ], + ); + } + /// Converts the given `Array` to a `VariantArray` and tests the conversion /// against the expected values. It also tests the handling of nulls by /// setting one element to null and verifying the output. From d2b1f2ca722f17689e16f035f2a15209429c1021 Mon Sep 17 00:00:00 2001 From: Liam Bao Date: Tue, 19 Aug 2025 21:25:30 -0400 Subject: [PATCH 2/3] Process runs in batches --- .../src/cast_to_variant.rs | 89 +++++++++---------- 1 file changed, 43 insertions(+), 46 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index f44423987a4f..cf9ef4d4c3e4 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -23,10 +23,11 @@ use arrow::array::{ TimestampSecondArray, }; use arrow::datatypes::{ - i256, BinaryType, BinaryViewType, Date32Type, Date64Type, Decimal128Type, Decimal256Type, - Decimal32Type, Decimal64Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, - Int64Type, Int8Type, LargeBinaryType, Time32MillisecondType, Time32SecondType, - Time64MicrosecondType, Time64NanosecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, + i256, ArrowNativeType, BinaryType, BinaryViewType, Date32Type, Date64Type, Decimal128Type, + Decimal256Type, Decimal32Type, Decimal64Type, Float16Type, Float32Type, Float64Type, Int16Type, + Int32Type, Int64Type, Int8Type, LargeBinaryType, RunEndIndexType, Time32MillisecondType, + Time32SecondType, Time64MicrosecondType, Time64NanosecondType, UInt16Type, UInt32Type, + UInt64Type, UInt8Type, }; use arrow::temporal_conversions::{ timestamp_ms_to_datetime, timestamp_ns_to_datetime, timestamp_s_to_datetime, @@ -503,48 +504,9 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { ); } DataType::RunEndEncoded(run_ends, _) => match run_ends.data_type() { - DataType::Int16 => { - let run_array = input.as_run::(); - let values_variant_array = cast_to_variant(run_array.values().as_ref())?; - - for i in 0..run_array.len() { - let physical_idx = run_array.get_physical_index(i); - if values_variant_array.is_null(physical_idx) { - builder.append_null(); - } else { - let value = values_variant_array.value(physical_idx); - builder.append_variant(value); - } - } - } - DataType::Int32 => { - let run_array = input.as_run::(); - let values_variant_array = cast_to_variant(run_array.values().as_ref())?; - - for i in 0..run_array.len() { - let physical_idx = run_array.get_physical_index(i); - if values_variant_array.is_null(physical_idx) { - builder.append_null(); - } else { - let value = values_variant_array.value(physical_idx); - builder.append_variant(value); - } - } - } - DataType::Int64 => { - let run_array = input.as_run::(); - let values_variant_array = cast_to_variant(run_array.values().as_ref())?; - - for i in 0..run_array.len() { - let physical_idx = run_array.get_physical_index(i); - if values_variant_array.is_null(physical_idx) { - builder.append_null(); - } else { - let value = values_variant_array.value(physical_idx); - builder.append_variant(value); - } - } - } + DataType::Int16 => process_run_end_encoded::(input, &mut builder)?, + DataType::Int32 => process_run_end_encoded::(input, &mut builder)?, + DataType::Int64 => process_run_end_encoded::(input, &mut builder)?, _ => { return Err(ArrowError::CastError(format!( "Unsupported run ends type: {:?}", @@ -561,6 +523,41 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { Ok(builder.build()) } +/// Generic function to process run-end encoded arrays +fn process_run_end_encoded( + input: &dyn Array, + builder: &mut VariantArrayBuilder, +) -> Result<(), ArrowError> { + let run_array = input.as_run::(); + let values_variant_array = cast_to_variant(run_array.values().as_ref())?; + + // Process runs in batches for better performance + let run_ends = run_array.run_ends().values(); + let mut logical_start = 0; + + for (physical_idx, &run_end) in run_ends.iter().enumerate() { + let logical_end = run_end.as_usize(); + let run_length = logical_end - logical_start; + + if values_variant_array.is_null(physical_idx) { + // Append nulls for the entire run + for _ in 0..run_length { + builder.append_null(); + } + } else { + // Get the value once and append it for the entire run + let value = values_variant_array.value(physical_idx); + for _ in 0..run_length { + builder.append_variant(value.clone()); + } + } + + logical_start = logical_end; + } + + Ok(()) +} + // TODO do we need a cast_with_options to allow specifying conversion behavior, // e.g. how to handle overflows, whether to convert to Variant::Null or return // an error, etc. ? From adf520860066a6e358a6166ef87992a0bc510fcd Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 20 Aug 2025 15:01:46 -0400 Subject: [PATCH 3/3] reorder tests for consistency --- .../src/cast_to_variant.rs | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index a9f84ffe65b6..43ee8ccb3929 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -1918,25 +1918,6 @@ mod tests { ); } - #[test] - fn test_cast_to_variant_dictionary() { - let values = StringArray::from(vec!["apple", "banana", "cherry", "date"]); - let keys = Int32Array::from(vec![Some(0), Some(1), None, Some(2), Some(0), Some(3)]); - let dict_array = DictionaryArray::::try_new(keys, Arc::new(values)).unwrap(); - - run_test( - Arc::new(dict_array), - vec![ - Some(Variant::from("apple")), - Some(Variant::from("banana")), - None, - Some(Variant::from("cherry")), - Some(Variant::from("apple")), - Some(Variant::from("date")), - ], - ); - } - #[test] fn test_cast_to_variant_run_end_encoded_with_nulls() { use arrow::array::StringRunBuilder; @@ -1965,6 +1946,25 @@ mod tests { ); } + #[test] + fn test_cast_to_variant_dictionary() { + let values = StringArray::from(vec!["apple", "banana", "cherry", "date"]); + let keys = Int32Array::from(vec![Some(0), Some(1), None, Some(2), Some(0), Some(3)]); + let dict_array = DictionaryArray::::try_new(keys, Arc::new(values)).unwrap(); + + run_test( + Arc::new(dict_array), + vec![ + Some(Variant::from("apple")), + Some(Variant::from("banana")), + None, + Some(Variant::from("cherry")), + Some(Variant::from("apple")), + Some(Variant::from("date")), + ], + ); + } + #[test] fn test_cast_to_variant_dictionary_with_nulls() { // Test dictionary with null values in the values array