Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 106 additions & 7 deletions parquet-variant-compute/src/cast_to_variant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,11 @@ use arrow::array::{
TimestampSecondArray,
};
use arrow::datatypes::{
i256, BinaryType, BinaryViewType, Date32Type, Date64Type, Decimal128Type, Decimal256Type,
Decimal32Type, Decimal64Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type,
Int64Type, Int8Type, LargeBinaryType, Time32MillisecondType, Time32SecondType,
Time64MicrosecondType, Time64NanosecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type,
i256, ArrowNativeType, BinaryType, BinaryViewType, Date32Type, Date64Type, Decimal128Type,
Decimal256Type, Decimal32Type, Decimal64Type, Float16Type, Float32Type, Float64Type, Int16Type,
Int32Type, Int64Type, Int8Type, LargeBinaryType, RunEndIndexType, Time32MillisecondType,
Time32SecondType, Time64MicrosecondType, Time64NanosecondType, UInt16Type, UInt32Type,
UInt64Type, UInt8Type,
};
use arrow::temporal_conversions::{
timestamp_ms_to_datetime, timestamp_ns_to_datetime, timestamp_s_to_datetime,
Expand Down Expand Up @@ -502,6 +503,17 @@ pub fn cast_to_variant(input: &dyn Array) -> Result<VariantArray, ArrowError> {
builder
);
}
DataType::RunEndEncoded(run_ends, _) => match run_ends.data_type() {
DataType::Int16 => process_run_end_encoded::<Int16Type>(input, &mut builder)?,
DataType::Int32 => process_run_end_encoded::<Int32Type>(input, &mut builder)?,
DataType::Int64 => process_run_end_encoded::<Int64Type>(input, &mut builder)?,
_ => {
return Err(ArrowError::CastError(format!(
"Unsupported run ends type: {:?}",
run_ends.data_type()
)));
}
},
DataType::Dictionary(_, _) => {
let dict_array = input.as_any_dictionary();
let values_variant_array = cast_to_variant(dict_array.values().as_ref())?;
Expand Down Expand Up @@ -532,6 +544,41 @@ pub fn cast_to_variant(input: &dyn Array) -> Result<VariantArray, ArrowError> {
Ok(builder.build())
}

/// Generic function to process run-end encoded arrays
fn process_run_end_encoded<R: RunEndIndexType>(
input: &dyn Array,
builder: &mut VariantArrayBuilder,
) -> Result<(), ArrowError> {
let run_array = input.as_run::<R>();
let values_variant_array = cast_to_variant(run_array.values().as_ref())?;

// Process runs in batches for better performance
let run_ends = run_array.run_ends().values();
let mut logical_start = 0;

for (physical_idx, &run_end) in run_ends.iter().enumerate() {
let logical_end = run_end.as_usize();
let run_length = logical_end - logical_start;

if values_variant_array.is_null(physical_idx) {
// Append nulls for the entire run
for _ in 0..run_length {
builder.append_null();
}
} else {
// Get the value once and append it for the entire run
let value = values_variant_array.value(physical_idx);
for _ in 0..run_length {
builder.append_variant(value.clone());
}
}

logical_start = logical_end;
}

Ok(())
}

// TODO do we need a cast_with_options to allow specifying conversion behavior,
// e.g. how to handle overflows, whether to convert to Variant::Null or return
// an error, etc. ?
Expand All @@ -544,9 +591,9 @@ mod tests {
Decimal256Array, Decimal32Array, Decimal64Array, DictionaryArray, FixedSizeBinaryBuilder,
Float16Array, Float32Array, Float64Array, GenericByteBuilder, GenericByteViewBuilder,
Int16Array, Int32Array, Int64Array, Int8Array, IntervalYearMonthArray, LargeStringArray,
NullArray, StringArray, StringViewArray, StructArray, Time32MillisecondArray,
Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, UInt16Array, UInt32Array,
UInt64Array, UInt8Array,
NullArray, StringArray, StringRunBuilder, StringViewArray, StructArray,
Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray,
UInt16Array, UInt32Array, UInt64Array, UInt8Array,
};
use arrow::buffer::NullBuffer;
use arrow_schema::{Field, Fields};
Expand Down Expand Up @@ -1847,6 +1894,58 @@ mod tests {
);
}

#[test]
fn test_cast_to_variant_run_end_encoded() {
let mut builder = StringRunBuilder::<Int32Type>::new();
builder.append_value("apple");
builder.append_value("apple");
builder.append_value("banana");
builder.append_value("banana");
builder.append_value("banana");
builder.append_value("cherry");
let run_array = builder.finish();

run_test(
Arc::new(run_array),
vec![
Some(Variant::from("apple")),
Some(Variant::from("apple")),
Some(Variant::from("banana")),
Some(Variant::from("banana")),
Some(Variant::from("banana")),
Some(Variant::from("cherry")),
],
);
}

#[test]
fn test_cast_to_variant_run_end_encoded_with_nulls() {
use arrow::array::StringRunBuilder;
use arrow::datatypes::Int32Type;

// Test run-end encoded array with nulls
let mut builder = StringRunBuilder::<Int32Type>::new();
builder.append_value("apple");
builder.append_null();
builder.append_value("banana");
builder.append_value("banana");
builder.append_null();
builder.append_null();
let run_array = builder.finish();

run_test(
Arc::new(run_array),
vec![
Some(Variant::from("apple")),
None,
Some(Variant::from("banana")),
Some(Variant::from("banana")),
None,
None,
],
);
}

#[test]
fn test_cast_to_variant_dictionary() {
let values = StringArray::from(vec!["apple", "banana", "cherry", "date"]);
Expand Down
Loading