-
Notifications
You must be signed in to change notification settings - Fork 934
Directly decode String/BinaryView types from arrow-row format #6044
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a3d6ad4
a9025b6
c20455e
6451fe0
9eb41d3
e2656c9
47adf22
859de09
0921f32
d182830
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,7 @@ use arrow_buffer::bit_util::ceil; | |
use arrow_buffer::MutableBuffer; | ||
use arrow_data::ArrayDataBuilder; | ||
use arrow_schema::{DataType, SortOptions}; | ||
use builder::make_view; | ||
|
||
/// The block size of the variable length encoding | ||
pub const BLOCK_SIZE: usize = 32; | ||
|
@@ -152,6 +153,8 @@ fn encode_blocks<const SIZE: usize>(out: &mut [u8], val: &[u8]) -> usize { | |
end_offset | ||
} | ||
|
||
/// Decodes a single block of data | ||
/// The `f` function accepts a slice of the decoded data, it may be called multiple times | ||
pub fn decode_blocks(row: &[u8], options: SortOptions, mut f: impl FnMut(&[u8])) -> usize { | ||
let (non_empty_sentinel, continuation) = match options.descending { | ||
true => (!NON_EMPTY_SENTINEL, !BLOCK_CONTINUATION), | ||
|
@@ -243,6 +246,69 @@ pub fn decode_binary<I: OffsetSizeTrait>( | |
unsafe { GenericBinaryArray::from(builder.build_unchecked()) } | ||
} | ||
|
||
fn decode_binary_view_inner( | ||
rows: &mut [&[u8]], | ||
options: SortOptions, | ||
check_utf8: bool, | ||
) -> BinaryViewArray { | ||
let len = rows.len(); | ||
|
||
let mut null_count = 0; | ||
|
||
let nulls = MutableBuffer::collect_bool(len, |x| { | ||
let valid = rows[x][0] != null_sentinel(options); | ||
null_count += !valid as usize; | ||
valid | ||
}); | ||
|
||
let values_capacity: usize = rows.iter().map(|row| decoded_len(row, options)).sum(); | ||
let mut values = MutableBuffer::new(values_capacity); | ||
let mut views = BufferBuilder::<u128>::new(len); | ||
|
||
for row in rows { | ||
let start_offset = values.len(); | ||
let offset = decode_blocks(row, options, |b| values.extend_from_slice(b)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder if we could do something similar to the parquet decoder (decode the strings into 2 different buffers, one for utf8 short string validation that was thrown away after conversion). |
||
if row[0] == null_sentinel(options) { | ||
debug_assert_eq!(offset, 1); | ||
debug_assert_eq!(start_offset, values.len()); | ||
views.append(0); | ||
} else { | ||
// Safety: we just appended the data to the end of the buffer | ||
let val = unsafe { values.get_unchecked_mut(start_offset..) }; | ||
|
||
if options.descending { | ||
val.iter_mut().for_each(|o| *o = !*o); | ||
} | ||
|
||
let view = make_view(val, 0, start_offset as u32); | ||
views.append(view); | ||
} | ||
*row = &row[offset..]; | ||
} | ||
|
||
if check_utf8 { | ||
// the values contains all data, no matter if it is short or long | ||
// we can validate utf8 in one go. | ||
std::str::from_utf8(values.as_slice()).unwrap(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I looked at it seems like the The argument would be if the Rows came from a valid RowEncoder, any data that was encoded to If there is some way to construct There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the |
||
} | ||
|
||
let builder = ArrayDataBuilder::new(DataType::BinaryView) | ||
.len(len) | ||
.null_count(null_count) | ||
.null_bit_buffer(Some(nulls.into())) | ||
.add_buffer(views.finish()) | ||
.add_buffer(values.into()); | ||
|
||
// SAFETY: | ||
// Valid by construction above | ||
unsafe { BinaryViewArray::from(builder.build_unchecked()) } | ||
} | ||
|
||
/// Decodes a binary view array from `rows` with the provided `options` | ||
pub fn decode_binary_view(rows: &mut [&[u8]], options: SortOptions) -> BinaryViewArray { | ||
decode_binary_view_inner(rows, options, false) | ||
} | ||
|
||
/// Decodes a string array from `rows` with the provided `options` | ||
/// | ||
/// # Safety | ||
|
@@ -269,16 +335,6 @@ pub unsafe fn decode_string<I: OffsetSizeTrait>( | |
GenericStringArray::from(builder.build_unchecked()) | ||
} | ||
|
||
/// Decodes a binary view array from `rows` with the provided `options` | ||
pub fn decode_binary_view(rows: &mut [&[u8]], options: SortOptions) -> BinaryViewArray { | ||
let decoded: GenericBinaryArray<i64> = decode_binary(rows, options); | ||
|
||
// Better performance might be to directly build the binary view instead of building to BinaryArray and then casting | ||
// I suspect that the overhead is not a big deal. | ||
// If it is, we can reimplement the `decode_binary_view` function to directly build the StringViewArray | ||
BinaryViewArray::from(&decoded) | ||
} | ||
|
||
/// Decodes a string view array from `rows` with the provided `options` | ||
/// | ||
/// # Safety | ||
|
@@ -289,9 +345,6 @@ pub unsafe fn decode_string_view( | |
options: SortOptions, | ||
validate_utf8: bool, | ||
) -> StringViewArray { | ||
let decoded: GenericStringArray<i64> = decode_string(rows, options, validate_utf8); | ||
// Better performance might be to directly build the string view instead of building to StringArray and then casting | ||
// I suspect that the overhead is not a big deal. | ||
// If it is, we can reimplement the `decode_string_view` function to directly build the StringViewArray | ||
StringViewArray::from(&decoded) | ||
let view = decode_binary_view_inner(rows, options, validate_utf8); | ||
view.to_string_view_unchecked() | ||
} |
Uh oh!
There was an error while loading. Please reload this page.