Skip to content

Commit 074bcb5

Browse files
XiangpengHao and alamb authored
Directly decode String/BinaryView types from arrow-row format (#6044)
* add string view bench
* check in new impl
* add utf8
* quick utf8 validation
* Update arrow-row/src/variable.rs

  Co-authored-by: Andrew Lamb <[email protected]>
* address comments
* update
* Revert "address comments"

  This reverts commit e2656c9.
* addr comments

---------

Co-authored-by: Andrew Lamb <[email protected]>
1 parent 0002b4d commit 074bcb5

File tree

2 files changed

+81
-16
lines changed

2 files changed

+81
-16
lines changed

arrow-row/src/variable.rs

Lines changed: 68 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ use arrow_buffer::bit_util::ceil;
2222
use arrow_buffer::MutableBuffer;
2323
use arrow_data::ArrayDataBuilder;
2424
use arrow_schema::{DataType, SortOptions};
25+
use builder::make_view;
2526

2627
/// The block size of the variable length encoding
2728
pub const BLOCK_SIZE: usize = 32;
@@ -152,6 +153,8 @@ fn encode_blocks<const SIZE: usize>(out: &mut [u8], val: &[u8]) -> usize {
152153
end_offset
153154
}
154155

156+
/// Decodes a single block of data
157+
/// The `f` function accepts a slice of the decoded data, it may be called multiple times
155158
pub fn decode_blocks(row: &[u8], options: SortOptions, mut f: impl FnMut(&[u8])) -> usize {
156159
let (non_empty_sentinel, continuation) = match options.descending {
157160
true => (!NON_EMPTY_SENTINEL, !BLOCK_CONTINUATION),
@@ -243,6 +246,69 @@ pub fn decode_binary<I: OffsetSizeTrait>(
243246
unsafe { GenericBinaryArray::from(builder.build_unchecked()) }
244247
}
245248

249+
fn decode_binary_view_inner(
250+
rows: &mut [&[u8]],
251+
options: SortOptions,
252+
check_utf8: bool,
253+
) -> BinaryViewArray {
254+
let len = rows.len();
255+
256+
let mut null_count = 0;
257+
258+
let nulls = MutableBuffer::collect_bool(len, |x| {
259+
let valid = rows[x][0] != null_sentinel(options);
260+
null_count += !valid as usize;
261+
valid
262+
});
263+
264+
let values_capacity: usize = rows.iter().map(|row| decoded_len(row, options)).sum();
265+
let mut values = MutableBuffer::new(values_capacity);
266+
let mut views = BufferBuilder::<u128>::new(len);
267+
268+
for row in rows {
269+
let start_offset = values.len();
270+
let offset = decode_blocks(row, options, |b| values.extend_from_slice(b));
271+
if row[0] == null_sentinel(options) {
272+
debug_assert_eq!(offset, 1);
273+
debug_assert_eq!(start_offset, values.len());
274+
views.append(0);
275+
} else {
276+
// Safety: we just appended the data to the end of the buffer
277+
let val = unsafe { values.get_unchecked_mut(start_offset..) };
278+
279+
if options.descending {
280+
val.iter_mut().for_each(|o| *o = !*o);
281+
}
282+
283+
let view = make_view(val, 0, start_offset as u32);
284+
views.append(view);
285+
}
286+
*row = &row[offset..];
287+
}
288+
289+
if check_utf8 {
290+
// the values contains all data, no matter if it is short or long
291+
// we can validate utf8 in one go.
292+
std::str::from_utf8(values.as_slice()).unwrap();
293+
}
294+
295+
let builder = ArrayDataBuilder::new(DataType::BinaryView)
296+
.len(len)
297+
.null_count(null_count)
298+
.null_bit_buffer(Some(nulls.into()))
299+
.add_buffer(views.finish())
300+
.add_buffer(values.into());
301+
302+
// SAFETY:
303+
// Valid by construction above
304+
unsafe { BinaryViewArray::from(builder.build_unchecked()) }
305+
}
306+
307+
/// Decodes a binary view array from `rows` with the provided `options`
308+
pub fn decode_binary_view(rows: &mut [&[u8]], options: SortOptions) -> BinaryViewArray {
309+
decode_binary_view_inner(rows, options, false)
310+
}
311+
246312
/// Decodes a string array from `rows` with the provided `options`
247313
///
248314
/// # Safety
@@ -269,16 +335,6 @@ pub unsafe fn decode_string<I: OffsetSizeTrait>(
269335
GenericStringArray::from(builder.build_unchecked())
270336
}
271337

272-
/// Decodes a binary view array from `rows` with the provided `options`
273-
pub fn decode_binary_view(rows: &mut [&[u8]], options: SortOptions) -> BinaryViewArray {
274-
let decoded: GenericBinaryArray<i64> = decode_binary(rows, options);
275-
276-
// Better performance might be to directly build the binary view instead of building to BinaryArray and then casting
277-
// I suspect that the overhead is not a big deal.
278-
// If it is, we can reimplement the `decode_binary_view` function to directly build the StringViewArray
279-
BinaryViewArray::from(&decoded)
280-
}
281-
282338
/// Decodes a string view array from `rows` with the provided `options`
283339
///
284340
/// # Safety
@@ -289,9 +345,6 @@ pub unsafe fn decode_string_view(
289345
options: SortOptions,
290346
validate_utf8: bool,
291347
) -> StringViewArray {
292-
let decoded: GenericStringArray<i64> = decode_string(rows, options, validate_utf8);
293-
// Better performance might be to directly build the string view instead of building to StringArray and then casting
294-
// I suspect that the overhead is not a big deal.
295-
// If it is, we can reimplement the `decode_string_view` function to directly build the StringViewArray
296-
StringViewArray::from(&decoded)
348+
let view = decode_binary_view_inner(rows, options, validate_utf8);
349+
view.to_string_view_unchecked()
297350
}

arrow/benches/row_format.rs

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ use arrow::datatypes::{Int64Type, UInt64Type};
2424
use arrow::row::{RowConverter, SortField};
2525
use arrow::util::bench_util::{
2626
create_boolean_array, create_dict_from_values, create_primitive_array,
27-
create_string_array_with_len, create_string_dict_array,
27+
create_string_array_with_len, create_string_dict_array, create_string_view_array_with_len,
2828
};
2929
use arrow_array::types::Int32Type;
3030
use arrow_array::Array;
@@ -87,6 +87,18 @@ fn row_bench(c: &mut Criterion) {
8787
let cols = vec![Arc::new(create_string_array_with_len::<i32>(4096, 0.5, 100)) as ArrayRef];
8888
do_bench(c, "4096 string(100, 0.5)", cols);
8989

90+
let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0., 10, false)) as ArrayRef];
91+
do_bench(c, "4096 string view(10, 0)", cols);
92+
93+
let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0., 30, false)) as ArrayRef];
94+
do_bench(c, "4096 string view(30, 0)", cols);
95+
96+
let cols = vec![Arc::new(create_string_view_array_with_len(40960, 0., 100, false)) as ArrayRef];
97+
do_bench(c, "40960 string view(100, 0)", cols);
98+
99+
let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0.5, 100, false)) as ArrayRef];
100+
do_bench(c, "4096 string view(100, 0.5)", cols);
101+
90102
let cols = vec![Arc::new(create_string_dict_array::<Int32Type>(4096, 0., 10)) as ArrayRef];
91103
do_bench(c, "4096 string_dictionary(10, 0)", cols);
92104

0 commit comments

Comments
 (0)