From 8421b963d32127341e8ebf0a50ac26a31239dfc3 Mon Sep 17 00:00:00 2001 From: ShiKaiWi Date: Fri, 20 Sep 2024 23:48:13 +0800 Subject: [PATCH 01/10] Use `Arc<[Buffer]>` instead of raw `Vec` in `GenericByteViewArray` for faster `slice` --- arrow-array/src/array/byte_view_array.rs | 28 +++++++++++++++--------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 713e275d186c..f7c4af417d23 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -162,7 +162,7 @@ use super::ByteArrayType; pub struct GenericByteViewArray { data_type: DataType, views: ScalarBuffer, - buffers: Vec, + buffers: Arc<[Buffer]>, phantom: PhantomData, nulls: Option, } @@ -216,7 +216,7 @@ impl GenericByteViewArray { Ok(Self { data_type: T::DATA_TYPE, views, - buffers, + buffers: buffers.into(), nulls, phantom: Default::default(), }) @@ -229,7 +229,7 @@ impl GenericByteViewArray { /// Safe if [`Self::try_new`] would not error pub unsafe fn new_unchecked( views: ScalarBuffer, - buffers: Vec, + buffers: impl Into>, nulls: Option, ) -> Self { if cfg!(feature = "force_validate") { @@ -240,7 +240,7 @@ impl GenericByteViewArray { data_type: T::DATA_TYPE, phantom: Default::default(), views, - buffers, + buffers: buffers.into(), nulls, } } @@ -250,7 +250,7 @@ impl GenericByteViewArray { Self { data_type: T::DATA_TYPE, views: vec![0; len].into(), - buffers: vec![], + buffers: vec![].into(), nulls: Some(NullBuffer::new_null(len)), phantom: Default::default(), } @@ -276,7 +276,7 @@ impl GenericByteViewArray { } /// Deconstruct this array into its constituent parts - pub fn into_parts(self) -> (ScalarBuffer, Vec, Option) { + pub fn into_parts(self) -> (ScalarBuffer, Arc<[Buffer]>, Option) { (self.views, self.buffers, self.nulls) } @@ -670,7 +670,7 @@ impl From for GenericByteViewArray { Self { data_type: T::DATA_TYPE, views, - buffers, + buffers: buffers.into(), nulls: value.nulls().cloned(), phantom: Default::default(), } @@ -734,12 +734,20 @@ where } impl From> for ArrayData { - fn from(mut array: GenericByteViewArray) -> Self { + fn from(array: GenericByteViewArray) -> Self { let len = array.len(); - array.buffers.insert(0, array.views.into_inner()); + let new_buffers = { + let mut buffers = Vec::with_capacity(array.buffers.len() + 1); + buffers.push(array.views.into_inner()); + for buffer in array.buffers.iter() { + buffers.push(buffer.clone()); + } + buffers + }; + let builder = ArrayDataBuilder::new(T::DATA_TYPE) .len(len) - .buffers(array.buffers) + .buffers(new_buffers) .nulls(array.nulls); unsafe { builder.build_unchecked() } From 2d9e7af3bbccd81c367452a9bfa4a923c8f4fc80 Mon Sep 17 00:00:00 2001 From: ShiKaiWi Date: Sat, 21 Sep 2024 00:30:04 +0800 Subject: [PATCH 02/10] add benchmark case about view array slice --- arrow-array/Cargo.toml | 2 +- arrow-array/benches/{gc_view_types.rs => view_types.rs} | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) rename arrow-array/benches/{gc_view_types.rs => view_types.rs} (92%) diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index a65c0c9ca8e6..8ebe21c70772 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -64,7 +64,7 @@ name = "occupancy" harness = false [[bench]] -name = "gc_view_types" +name = "view_types" harness = false [[bench]] diff --git a/arrow-array/benches/gc_view_types.rs b/arrow-array/benches/view_types.rs similarity index 92% rename from arrow-array/benches/gc_view_types.rs rename to arrow-array/benches/view_types.rs index cab60b47af79..b7507d4a2c9d 100644 --- a/arrow-array/benches/gc_view_types.rs +++ b/arrow-array/benches/view_types.rs @@ -43,6 +43,12 @@ fn criterion_benchmark(c: &mut Criterion) { hint::black_box(sliced.gc()); }); }); + + c.bench_function("view types slice", |b| { + b.iter(|| { + black_box(array.slice(0, 100_000 / 2)); + }); + }); } criterion_group!(benches, criterion_benchmark); From 907819996ca543cab78d2aaef05a12832694ff92 Mon Sep 17 00:00:00 2001 From: ctsk <9384305+ctsk@users.noreply.github.com> Date: Wed, 25 Jun 2025 15:13:59 +0200 Subject: [PATCH 03/10] Fixup rebase errors --- arrow-array/src/array/byte_view_array.rs | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index f7c4af417d23..249cd0b5e8b6 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -185,7 +185,11 @@ impl GenericByteViewArray { /// # Panics /// /// Panics if [`GenericByteViewArray::try_new`] returns an error - pub fn new(views: ScalarBuffer, buffers: Vec, nulls: Option) -> Self { + pub fn new( + views: ScalarBuffer, + buffers: impl Into>, + nulls: Option, + ) -> Self { Self::try_new(views, buffers, nulls).unwrap() } @@ -197,9 +201,11 @@ impl GenericByteViewArray { /// * [ByteViewType::validate] fails pub fn try_new( views: ScalarBuffer, - buffers: Vec, + buffers: impl Into>, nulls: Option, ) -> Result { + let buffers: Arc<[Buffer]> = buffers.into(); + T::validate(&views, &buffers)?; if let Some(n) = nulls.as_ref() { @@ -216,7 +222,7 @@ impl GenericByteViewArray { Ok(Self { data_type: T::DATA_TYPE, views, - buffers: buffers.into(), + buffers, nulls, phantom: Default::default(), }) @@ -607,8 +613,9 @@ impl Array for GenericByteViewArray { fn shrink_to_fit(&mut self) { self.views.shrink_to_fit(); - self.buffers.iter_mut().for_each(|b| b.shrink_to_fit()); - self.buffers.shrink_to_fit(); + if let Some(buffers) = Arc::get_mut(&mut self.buffers) { + buffers.iter_mut().for_each(|b| b.shrink_to_fit()); + } if let Some(nulls) = &mut self.nulls { nulls.shrink_to_fit(); } From 0748cc0a3c7559d921ebd4d16682974afee9f7c1 Mon Sep 17 00:00:00 2001 From: ctsk <9384305+ctsk@users.noreply.github.com> Date: Wed, 25 Jun 2025 15:27:35 +0200 Subject: [PATCH 04/10] Implement ViewBuffers abstraction --- arrow-array/src/lib.rs | 2 ++ arrow-array/src/view_buffers.rs | 27 +++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 arrow-array/src/view_buffers.rs diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs index 91696540d219..fd38e59900d6 100644 --- a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -259,6 +259,8 @@ pub mod temporal_conversions; pub mod timezone; mod trusted_len; pub mod types; +mod view_buffers; +pub use view_buffers::ViewBuffers; #[cfg(test)] mod tests { diff --git a/arrow-array/src/view_buffers.rs b/arrow-array/src/view_buffers.rs new file mode 100644 index 000000000000..10299c9e9e4c --- /dev/null +++ b/arrow-array/src/view_buffers.rs @@ -0,0 +1,27 @@ +use std::sync::Arc; + +use arrow_buffer::Buffer; + +/// A cheaply cloneable, owned slice of [`Buffer`] +/// +/// Similar to `Arc>` or `Arc<[Buffer]>` +#[derive(Clone, Debug)] +pub struct ViewBuffers(pub(crate) Arc<[Buffer]>); + +impl FromIterator for ViewBuffers { + fn from_iter>(iter: T) -> Self { + Self(iter.into_iter().collect()) + } +} + +impl From> for ViewBuffers { + fn from(value: Vec) -> Self { + Self(value.into()) + } +} + +impl From<&[Buffer]> for ViewBuffers { + fn from(value: &[Buffer]) -> Self { + Self(value.into()) + } +} From aaa2341ab93c3ac0f583e82cabd5d301c6c113d2 Mon Sep 17 00:00:00 2001 From: ctsk <9384305+ctsk@users.noreply.github.com> Date: Wed, 25 Jun 2025 15:28:34 +0200 Subject: [PATCH 05/10] Use ViewBuffers abstraction --- arrow-array/src/array/byte_view_array.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 249cd0b5e8b6..b56e646b4a3f 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -20,7 +20,9 @@ use crate::builder::{ArrayBuilder, GenericByteViewBuilder}; use crate::iterator::ArrayIter; use crate::types::bytes::ByteArrayNativeType; use crate::types::{BinaryViewType, ByteViewType, StringViewType}; -use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, Scalar}; +use crate::{ + Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, Scalar, ViewBuffers, +}; use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder, ByteView}; use arrow_schema::{ArrowError, DataType}; @@ -187,7 +189,7 @@ impl GenericByteViewArray { /// Panics if [`GenericByteViewArray::try_new`] returns an error pub fn new( views: ScalarBuffer, - buffers: impl Into>, + buffers: impl Into, nulls: Option, ) -> Self { Self::try_new(views, buffers, nulls).unwrap() @@ -201,10 +203,10 @@ impl GenericByteViewArray { /// * [ByteViewType::validate] fails pub fn try_new( views: ScalarBuffer, - buffers: impl Into>, + buffers: impl Into, nulls: Option, ) -> Result { - let buffers: Arc<[Buffer]> = buffers.into(); + let buffers: Arc<[Buffer]> = buffers.into().0; T::validate(&views, &buffers)?; @@ -235,7 +237,7 @@ impl GenericByteViewArray { /// Safe if [`Self::try_new`] would not error pub unsafe fn new_unchecked( views: ScalarBuffer, - buffers: impl Into>, + buffers: impl Into, nulls: Option, ) -> Self { if cfg!(feature = "force_validate") { @@ -246,7 +248,7 @@ impl GenericByteViewArray { data_type: T::DATA_TYPE, phantom: Default::default(), views, - buffers: buffers.into(), + buffers: buffers.into().0, nulls, } } @@ -810,7 +812,7 @@ impl BinaryViewArray { /// # Safety /// Caller is responsible for ensuring that items in array are utf8 data. pub unsafe fn to_string_view_unchecked(self) -> StringViewArray { - StringViewArray::new_unchecked(self.views, self.buffers, self.nulls) + StringViewArray::new_unchecked(self.views, ViewBuffers(self.buffers), self.nulls) } } @@ -842,7 +844,7 @@ pub type StringViewArray = GenericByteViewArray; impl StringViewArray { /// Convert the [`StringViewArray`] to [`BinaryViewArray`] pub fn to_binary_view(self) -> BinaryViewArray { - unsafe { BinaryViewArray::new_unchecked(self.views, self.buffers, self.nulls) } + unsafe { BinaryViewArray::new_unchecked(self.views, ViewBuffers(self.buffers), self.nulls) } } /// Returns true if all data within this array is ASCII From 25e215a211007eb3d91938f2a4faf33af9d8656f Mon Sep 17 00:00:00 2001 From: ctsk <9384305+ctsk@users.noreply.github.com> Date: Wed, 25 Jun 2025 15:45:28 +0200 Subject: [PATCH 06/10] Add license --- arrow-array/src/view_buffers.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arrow-array/src/view_buffers.rs b/arrow-array/src/view_buffers.rs index 10299c9e9e4c..fe1609e0180d 100644 --- a/arrow-array/src/view_buffers.rs +++ b/arrow-array/src/view_buffers.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use std::sync::Arc; use arrow_buffer::Buffer; From 9eda1136d72b8b83a1bcfc6e45c54f1cd259c275 Mon Sep 17 00:00:00 2001 From: ctsk <9384305+ctsk@users.noreply.github.com> Date: Wed, 25 Jun 2025 23:03:18 +0200 Subject: [PATCH 07/10] Keep encapsulation in struct --- arrow-array/src/array/byte_view_array.rs | 14 +++++++------- arrow-array/src/view_buffers.rs | 10 +++++++++- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index b56e646b4a3f..50ff6ccdfe15 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -164,7 +164,7 @@ use super::ByteArrayType; pub struct GenericByteViewArray { data_type: DataType, views: ScalarBuffer, - buffers: Arc<[Buffer]>, + buffers: ViewBuffers, phantom: PhantomData, nulls: Option, } @@ -206,7 +206,7 @@ impl GenericByteViewArray { buffers: impl Into, nulls: Option, ) -> Result { - let buffers: Arc<[Buffer]> = buffers.into().0; + let buffers = buffers.into(); T::validate(&views, &buffers)?; @@ -248,7 +248,7 @@ impl GenericByteViewArray { data_type: T::DATA_TYPE, phantom: Default::default(), views, - buffers: buffers.into().0, + buffers: buffers.into(), nulls, } } @@ -284,7 +284,7 @@ impl GenericByteViewArray { } /// Deconstruct this array into its constituent parts - pub fn into_parts(self) -> (ScalarBuffer, Arc<[Buffer]>, Option) { + pub fn into_parts(self) -> (ScalarBuffer, ViewBuffers, Option) { (self.views, self.buffers, self.nulls) } @@ -615,7 +615,7 @@ impl Array for GenericByteViewArray { fn shrink_to_fit(&mut self) { self.views.shrink_to_fit(); - if let Some(buffers) = Arc::get_mut(&mut self.buffers) { + if let Some(buffers) = Arc::get_mut(&mut self.buffers.0) { buffers.iter_mut().for_each(|b| b.shrink_to_fit()); } if let Some(nulls) = &mut self.nulls { @@ -812,7 +812,7 @@ impl BinaryViewArray { /// # Safety /// Caller is responsible for ensuring that items in array are utf8 data. pub unsafe fn to_string_view_unchecked(self) -> StringViewArray { - StringViewArray::new_unchecked(self.views, ViewBuffers(self.buffers), self.nulls) + StringViewArray::new_unchecked(self.views, self.buffers, self.nulls) } } @@ -844,7 +844,7 @@ pub type StringViewArray = GenericByteViewArray; impl StringViewArray { /// Convert the [`StringViewArray`] to [`BinaryViewArray`] pub fn to_binary_view(self) -> BinaryViewArray { - unsafe { BinaryViewArray::new_unchecked(self.views, ViewBuffers(self.buffers), self.nulls) } + unsafe { BinaryViewArray::new_unchecked(self.views, self.buffers, self.nulls) } } /// Returns true if all data within this array is ASCII diff --git a/arrow-array/src/view_buffers.rs b/arrow-array/src/view_buffers.rs index fe1609e0180d..98a3c62b782b 100644 --- a/arrow-array/src/view_buffers.rs +++ b/arrow-array/src/view_buffers.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use std::sync::Arc; +use std::{ops::Deref, sync::Arc}; use arrow_buffer::Buffer; @@ -42,3 +42,11 @@ impl From<&[Buffer]> for ViewBuffers { Self(value.into()) } } + +impl Deref for ViewBuffers { + type Target = [Buffer]; + + fn deref(&self) -> &Self::Target { + self.0.as_ref() + } +} From cd4bf18611132b232643aea2b17b0a231c0cd4b3 Mon Sep 17 00:00:00 2001 From: ctsk <9384305+ctsk@users.noreply.github.com> Date: Wed, 25 Jun 2025 23:13:47 +0200 Subject: [PATCH 08/10] Clean up ArrayData::from(GenericByteViewArray) --- arrow-array/src/array/byte_view_array.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 50ff6ccdfe15..04566a4d6c8f 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -748,9 +748,7 @@ impl From> for ArrayData { let new_buffers = { let mut buffers = Vec::with_capacity(array.buffers.len() + 1); buffers.push(array.views.into_inner()); - for buffer in array.buffers.iter() { - buffers.push(buffer.clone()); - } + buffers.extend_from_slice(&array.buffers.0); buffers }; From cf1947703578cc3b7e1d65518200c836355d8b2c Mon Sep 17 00:00:00 2001 From: ctsk <9384305+ctsk@users.noreply.github.com> Date: Thu, 26 Jun 2025 08:57:38 +0200 Subject: [PATCH 09/10] Mini clean-up --- arrow-array/src/array/byte_view_array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 04566a4d6c8f..6c54f88c5f3e 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -748,7 +748,7 @@ impl From> for ArrayData { let new_buffers = { let mut buffers = Vec::with_capacity(array.buffers.len() + 1); buffers.push(array.views.into_inner()); - buffers.extend_from_slice(&array.buffers.0); + buffers.extend_from_slice(&array.buffers); buffers }; From 180636b216bdb24276438cf7d2efb88eb984cd12 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 27 Jun 2025 08:16:14 -0400 Subject: [PATCH 10/10] try using Arc> --- arrow-array/src/array/byte_view_array.rs | 17 +++++++--------- arrow-array/src/view_buffers.rs | 25 ++++++++++++++++-------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 6510b1b0c285..88ab58ce08b4 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -617,9 +617,10 @@ impl Array for GenericByteViewArray { fn shrink_to_fit(&mut self) { self.views.shrink_to_fit(); - if let Some(buffers) = Arc::get_mut(&mut self.buffers.0) { - buffers.iter_mut().for_each(|b| b.shrink_to_fit()); - } + self.buffers + .make_mut() + .iter_mut() + .for_each(|b| b.shrink_to_fit()); if let Some(nulls) = &mut self.nulls { nulls.shrink_to_fit(); } @@ -747,16 +748,12 @@ where impl From> for ArrayData { fn from(array: GenericByteViewArray) -> Self { let len = array.len(); - let new_buffers = { - let mut buffers = Vec::with_capacity(array.buffers.len() + 1); - buffers.push(array.views.into_inner()); - buffers.extend_from_slice(&array.buffers); - buffers - }; + let mut buffers = array.buffers.unwrap_or_clone(); + buffers.insert(0, array.views.into_inner()); let builder = ArrayDataBuilder::new(T::DATA_TYPE) .len(len) - .buffers(new_buffers) + .buffers(buffers) .nulls(array.nulls); unsafe { builder.build_unchecked() } diff --git a/arrow-array/src/view_buffers.rs b/arrow-array/src/view_buffers.rs index 98a3c62b782b..e0ec6938a75a 100644 --- a/arrow-array/src/view_buffers.rs +++ b/arrow-array/src/view_buffers.rs @@ -23,11 +23,26 @@ use arrow_buffer::Buffer; /// /// Similar to `Arc>` or `Arc<[Buffer]>` #[derive(Clone, Debug)] -pub struct ViewBuffers(pub(crate) Arc<[Buffer]>); +pub struct ViewBuffers(Arc>); + +impl ViewBuffers { + /// Return a mutable reference to the underlying buffers, copying the buffers if necessary. + pub fn make_mut(&mut self) -> &mut Vec { + // If the underlying Arc is unique, we can mutate it in place + Arc::make_mut(&mut self.0) + } + + /// Convertes this ViewBuffers into a Vec, cloning the underlying buffers if + /// they are shared. + pub fn unwrap_or_clone(self) -> Vec { + Arc::unwrap_or_clone(self.0) + } +} impl FromIterator for ViewBuffers { fn from_iter>(iter: T) -> Self { - Self(iter.into_iter().collect()) + let v: Vec<_> = iter.into_iter().collect(); + Self(v.into()) } } @@ -37,12 +52,6 @@ impl From> for ViewBuffers { } } -impl From<&[Buffer]> for ViewBuffers { - fn from(value: &[Buffer]) -> Self { - Self(value.into()) - } -} - impl Deref for ViewBuffers { type Target = [Buffer];