Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions arrow-array/src/array/run_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,12 @@ use crate::{

/// An array of [run-end encoded values](https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout)
///
/// This encoding is variation on [run-length encoding (RLE)](https://en.wikipedia.org/wiki/Run-length_encoding)
/// This encoding is a variation on [run-length encoding (RLE)](https://en.wikipedia.org/wiki/Run-length_encoding)
/// and is good for representing data containing same values repeated consecutively.
///
/// [`RunArray`] contains `run_ends` array and `values` array of same length.
/// The `run_ends` array stores the indexes at which the run ends. The `values` array
/// stores the value of each run. Below example illustrates how a logical array is represented in
/// stores the value of each run. The below example illustrates how a logical array is represented in
/// [`RunArray`]
///
///
Expand Down
5 changes: 4 additions & 1 deletion arrow-cast/src/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
| FixedSizeList(_, _)
| Struct(_)
| Map(_, _)
| Dictionary(_, _),
| Dictionary(_, _)
| RunEndEncoded(_, _),
) => true,
// Dictionary/List conditions should be put in front of others
(Dictionary(_, from_value_type), Dictionary(_, to_value_type)) => {
Expand Down Expand Up @@ -179,6 +180,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
_ => false,
}
}
// TODO: RunEndEncoded here?
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is handled in #7713

// cast one decimal type to another decimal type
(
Decimal32(_, _) | Decimal64(_, _) | Decimal128(_, _) | Decimal256(_, _),
Expand Down Expand Up @@ -815,6 +817,7 @@ pub fn cast_with_options(
"Casting from type {from_type:?} to dictionary type {to_type:?} not supported",
))),
},
// TODO: RunEndEncoded here?
(List(_), List(to)) => cast_list_values::<i32>(array, to, cast_options),
(LargeList(_), LargeList(to)) => cast_list_values::<i64>(array, to, cast_options),
(List(_), LargeList(list_to)) => cast_list::<i32, i64>(array, list_to, cast_options),
Expand Down
2 changes: 1 addition & 1 deletion arrow-schema/src/datatype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ pub enum DataType {
/// that contain many repeated values using less memory, but with
/// a higher CPU overhead for some operations.
///
/// This type mostly used to represent low cardinality string
/// This type is mostly used to represent low cardinality string
/// arrays or a limited set of primitive types as integers.
Dictionary(Box<DataType>, Box<DataType>),
/// Exact 32-bit width decimal value with precision and scale
Expand Down
27 changes: 27 additions & 0 deletions parquet/src/arrow/array_reader/byte_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,28 @@ pub fn make_byte_array_reader(
pages, data_type, reader,
)))
}
// TODO eventually add a dedicated [`ArrayReader`] for REE
ArrowType::RunEndEncoded(_, ref val_field) => match val_field.data_type() {
ArrowType::Binary
| ArrowType::Utf8
| ArrowType::Decimal128(_, _)
| ArrowType::Decimal256(_, _) => {
let reader = GenericRecordReader::new(column_desc);
Ok(Box::new(ByteArrayReader::<i32>::new(
pages, data_type, reader,
)))
}
ArrowType::LargeUtf8 | ArrowType::LargeBinary => {
let reader = GenericRecordReader::new(column_desc);
Ok(Box::new(ByteArrayReader::<i64>::new(
pages, data_type, reader,
)))
}
_ => Err(general_err!(
"invalid run end encoded value type for byte array reader - {}",
data_type
)),
},
_ => Err(general_err!(
"invalid data type for byte array reader - {}",
data_type
Expand Down Expand Up @@ -147,6 +169,11 @@ impl<I: OffsetSizeTrait> ArrayReader for ByteArrayReader<I> {
.with_precision_and_scale(p, s)?;
Arc::new(decimal)
}
// TODO eventually add a dedicated [`ArrayReader`] for REE
ArrowType::RunEndEncoded(_, ref val_field) => {
let array = buffer.into_array(null_buffer, val_field.data_type().clone());
arrow_cast::cast(&array, &self.data_type)?
}
_ => buffer.into_array(null_buffer, self.data_type.clone()),
};

Expand Down
38 changes: 37 additions & 1 deletion parquet/src/arrow/arrow_writer/byte_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ use crate::util::bit_util::num_required_bits;
use crate::util::interner::{Interner, Storage};
use arrow_array::{
Array, ArrayAccessor, BinaryArray, BinaryViewArray, DictionaryArray, FixedSizeBinaryArray,
LargeBinaryArray, LargeStringArray, StringArray, StringViewArray,
LargeBinaryArray, LargeStringArray, RunArray, StringArray, StringViewArray,
};
use arrow_schema::DataType;

Expand Down Expand Up @@ -59,6 +59,28 @@ macro_rules! downcast_dict_op {
};
}

macro_rules! downcast_ree_impl {
($array:ident, $key:ident, $val:ident, $op:expr $(, $arg:expr)*) => {{
$op($array
.as_any()
.downcast_ref::<RunArray<arrow_array::types::$key>>()
.unwrap()
.downcast::<$val>()
.unwrap()$(, $arg)*)
}};
}

macro_rules! downcast_ree_op {
($run_end_field:expr, $val:ident, $array:ident, $op:expr $(, $arg:expr)*) => {
match $run_end_field.data_type() {
DataType::Int16 => downcast_ree_impl!($array, Int16Type, $val, $op$(, $arg)*),
DataType::Int32 => downcast_ree_impl!($array, Int32Type, $val, $op$(, $arg)*),
DataType::Int64 => downcast_ree_impl!($array, Int64Type, $val, $op$(, $arg)*),
_ => unreachable!(),
}
};
}

macro_rules! downcast_op {
($data_type:expr, $array:ident, $op:expr $(, $arg:expr)*) => {
match $data_type {
Expand Down Expand Up @@ -90,6 +112,20 @@ macro_rules! downcast_op {
}
d => unreachable!("cannot downcast {} dictionary value to byte array", d),
},
DataType::RunEndEncoded(run_end, value) => match value.data_type() {
DataType::Utf8 => downcast_ree_op!(run_end, StringArray, $array, $op$(, $arg)*),
DataType::LargeUtf8 => {
downcast_ree_op!(run_end, LargeStringArray, $array, $op$(, $arg)*)
}
DataType::Binary => downcast_ree_op!(run_end, BinaryArray, $array, $op$(, $arg)*),
DataType::LargeBinary => {
downcast_ree_op!(run_end, LargeBinaryArray, $array, $op$(, $arg)*)
}
DataType::FixedSizeBinary(_) => {
downcast_ree_op!(run_end, FixedSizeBinaryArray, $array, $op$(, $arg)*)
}
d => unreachable!("cannot downcast {} run end encoded value to byte array", d),
},
d => unreachable!("cannot downcast {} to byte array", d),
}
};
Expand Down
4 changes: 4 additions & 0 deletions parquet/src/arrow/arrow_writer/levels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,10 @@ impl LevelInfoBuilder {
_ => unreachable!(),
})
}
DataType::RunEndEncoded(_, v) if is_leaf(v.data_type()) => {
let levels = ArrayLevels::new(parent_ctx, is_nullable, array.clone());
Ok(Self::Primitive(levels))
}
d => Err(nyi_err!("Datatype {} is not yet supported", d)),
}
}
Expand Down
Loading