@@ -22,6 +22,7 @@ use arrow_buffer::bit_util::ceil;
22
22
use arrow_buffer:: MutableBuffer ;
23
23
use arrow_data:: ArrayDataBuilder ;
24
24
use arrow_schema:: { DataType , SortOptions } ;
25
+ use builder:: make_view;
25
26
26
27
/// The block size of the variable length encoding
27
28
pub const BLOCK_SIZE : usize = 32 ;
@@ -152,6 +153,8 @@ fn encode_blocks<const SIZE: usize>(out: &mut [u8], val: &[u8]) -> usize {
152
153
end_offset
153
154
}
154
155
156
+ /// Decodes a single value, which may be encoded as multiple blocks
+ /// The `f` function accepts a slice of the decoded data; it may be called multiple times
155
158
pub fn decode_blocks ( row : & [ u8 ] , options : SortOptions , mut f : impl FnMut ( & [ u8 ] ) ) -> usize {
156
159
let ( non_empty_sentinel, continuation) = match options. descending {
157
160
true => ( !NON_EMPTY_SENTINEL , !BLOCK_CONTINUATION ) ,
@@ -243,6 +246,69 @@ pub fn decode_binary<I: OffsetSizeTrait>(
243
246
unsafe { GenericBinaryArray :: from ( builder. build_unchecked ( ) ) }
244
247
}
245
248
249
+ fn decode_binary_view_inner (
250
+ rows : & mut [ & [ u8 ] ] ,
251
+ options : SortOptions ,
252
+ check_utf8 : bool ,
253
+ ) -> BinaryViewArray {
254
+ let len = rows. len ( ) ;
255
+
256
+ let mut null_count = 0 ;
257
+
258
+ let nulls = MutableBuffer :: collect_bool ( len, |x| {
259
+ let valid = rows[ x] [ 0 ] != null_sentinel ( options) ;
260
+ null_count += !valid as usize ;
261
+ valid
262
+ } ) ;
263
+
264
+ let values_capacity: usize = rows. iter ( ) . map ( |row| decoded_len ( row, options) ) . sum ( ) ;
265
+ let mut values = MutableBuffer :: new ( values_capacity) ;
266
+ let mut views = BufferBuilder :: < u128 > :: new ( len) ;
267
+
268
+ for row in rows {
269
+ let start_offset = values. len ( ) ;
270
+ let offset = decode_blocks ( row, options, |b| values. extend_from_slice ( b) ) ;
271
+ if row[ 0 ] == null_sentinel ( options) {
272
+ debug_assert_eq ! ( offset, 1 ) ;
273
+ debug_assert_eq ! ( start_offset, values. len( ) ) ;
274
+ views. append ( 0 ) ;
275
+ } else {
276
+ // Safety: we just appended the data to the end of the buffer
277
+ let val = unsafe { values. get_unchecked_mut ( start_offset..) } ;
278
+
279
+ if options. descending {
280
+ val. iter_mut ( ) . for_each ( |o| * o = !* o) ;
281
+ }
282
+
283
+ let view = make_view ( val, 0 , start_offset as u32 ) ;
284
+ views. append ( view) ;
285
+ }
286
+ * row = & row[ offset..] ;
287
+ }
288
+
289
+ if check_utf8 {
290
+ // the values contains all data, no matter if it is short or long
291
+ // we can validate utf8 in one go.
292
+ std:: str:: from_utf8 ( values. as_slice ( ) ) . unwrap ( ) ;
293
+ }
294
+
295
+ let builder = ArrayDataBuilder :: new ( DataType :: BinaryView )
296
+ . len ( len)
297
+ . null_count ( null_count)
298
+ . null_bit_buffer ( Some ( nulls. into ( ) ) )
299
+ . add_buffer ( views. finish ( ) )
300
+ . add_buffer ( values. into ( ) ) ;
301
+
302
+ // SAFETY:
303
+ // Valid by construction above
304
+ unsafe { BinaryViewArray :: from ( builder. build_unchecked ( ) ) }
305
+ }
306
+
307
+ /// Decodes a binary view array from `rows` with the provided `options`
308
+ pub fn decode_binary_view ( rows : & mut [ & [ u8 ] ] , options : SortOptions ) -> BinaryViewArray {
309
+ decode_binary_view_inner ( rows, options, false )
310
+ }
311
+
246
312
/// Decodes a string array from `rows` with the provided `options`
247
313
///
248
314
/// # Safety
@@ -269,16 +335,6 @@ pub unsafe fn decode_string<I: OffsetSizeTrait>(
269
335
GenericStringArray :: from ( builder. build_unchecked ( ) )
270
336
}
271
337
272
- /// Decodes a binary view array from `rows` with the provided `options`
273
- pub fn decode_binary_view ( rows : & mut [ & [ u8 ] ] , options : SortOptions ) -> BinaryViewArray {
274
- let decoded: GenericBinaryArray < i64 > = decode_binary ( rows, options) ;
275
-
276
- // Better performance might be to directly build the binary view instead of building to BinaryArray and then casting
277
- // I suspect that the overhead is not a big deal.
278
- // If it is, we can reimplement the `decode_binary_view` function to directly build the StringViewArray
279
- BinaryViewArray :: from ( & decoded)
280
- }
281
-
282
338
/// Decodes a string view array from `rows` with the provided `options`
283
339
///
284
340
/// # Safety
@@ -289,9 +345,6 @@ pub unsafe fn decode_string_view(
289
345
options : SortOptions ,
290
346
validate_utf8 : bool ,
291
347
) -> StringViewArray {
292
- let decoded: GenericStringArray < i64 > = decode_string ( rows, options, validate_utf8) ;
293
- // Better performance might be to directly build the string view instead of building to StringArray and then casting
294
- // I suspect that the overhead is not a big deal.
295
- // If it is, we can reimplement the `decode_string_view` function to directly build the StringViewArray
296
- StringViewArray :: from ( & decoded)
348
+ let view = decode_binary_view_inner ( rows, options, validate_utf8) ;
349
+ view. to_string_view_unchecked ( )
297
350
}
0 commit comments