diff --git a/arrow-array/benches/view_types.rs b/arrow-array/benches/view_types.rs index 929a97551632..5a6d86e170bf 100644 --- a/arrow-array/benches/view_types.rs +++ b/arrow-array/benches/view_types.rs @@ -48,6 +48,12 @@ fn criterion_benchmark(c: &mut Criterion) { black_box(array.slice(0, 100_000 / 2)); }); }); + + c.bench_function("view types slice", |b| { + b.iter(|| { + black_box(array.slice(0, 100_000 / 2)); + }); + }); } criterion_group!(benches, criterion_benchmark); diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 44df00aeb3cb..be1a611481a9 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -20,7 +20,9 @@ use crate::builder::{ArrayBuilder, GenericByteViewBuilder}; use crate::iterator::ArrayIter; use crate::types::bytes::ByteArrayNativeType; use crate::types::{BinaryViewType, ByteViewType, StringViewType}; -use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, Scalar}; +use crate::{ + Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, Scalar, ViewBuffers, +}; use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN}; use arrow_schema::{ArrowError, DataType}; @@ -164,7 +166,7 @@ use super::ByteArrayType; pub struct GenericByteViewArray { data_type: DataType, views: ScalarBuffer, - buffers: Vec, + buffers: ViewBuffers, phantom: PhantomData, nulls: Option, } @@ -187,7 +189,11 @@ impl GenericByteViewArray { /// # Panics /// /// Panics if [`GenericByteViewArray::try_new`] returns an error - pub fn new(views: ScalarBuffer, buffers: Vec, nulls: Option) -> Self { + pub fn new( + views: ScalarBuffer, + buffers: impl Into, + nulls: Option, + ) -> Self { Self::try_new(views, buffers, nulls).unwrap() } @@ -199,9 +205,11 @@ impl GenericByteViewArray { /// * [ByteViewType::validate] fails pub fn try_new( views: ScalarBuffer, - buffers: Vec, + buffers: impl Into, nulls: Option, ) -> Result { + let buffers = buffers.into(); + T::validate(&views, &buffers)?; if let Some(n) = nulls.as_ref() { @@ -231,7 +239,7 @@ impl GenericByteViewArray { /// Safe if [`Self::try_new`] would not error pub unsafe fn new_unchecked( views: ScalarBuffer, - buffers: Vec, + buffers: impl Into, nulls: Option, ) -> Self { if cfg!(feature = "force_validate") { @@ -242,7 +250,7 @@ impl GenericByteViewArray { data_type: T::DATA_TYPE, phantom: Default::default(), views, - buffers, + buffers: buffers.into(), nulls, } } @@ -252,7 +260,7 @@ impl GenericByteViewArray { Self { data_type: T::DATA_TYPE, views: vec![0; len].into(), - buffers: vec![], + buffers: vec![].into(), nulls: Some(NullBuffer::new_null(len)), phantom: Default::default(), } @@ -278,7 +286,7 @@ impl GenericByteViewArray { } /// Deconstruct this array into its constituent parts - pub fn into_parts(self) -> (ScalarBuffer, Vec, Option) { + pub fn into_parts(self) -> (ScalarBuffer, ViewBuffers, Option) { (self.views, self.buffers, self.nulls) } @@ -609,8 +617,9 @@ impl Array for GenericByteViewArray { fn shrink_to_fit(&mut self) { self.views.shrink_to_fit(); - self.buffers.iter_mut().for_each(|b| b.shrink_to_fit()); - self.buffers.shrink_to_fit(); + if let Some(buffers) = Arc::get_mut(&mut self.buffers.0) { + buffers.iter_mut().for_each(|b| b.shrink_to_fit()); + } if let Some(nulls) = &mut self.nulls { nulls.shrink_to_fit(); } @@ -668,11 +677,11 @@ impl From for GenericByteViewArray { fn from(value: ArrayData) -> Self { let views = value.buffers()[0].clone(); let views = ScalarBuffer::new(views, value.offset(), value.len()); - let buffers = value.buffers()[1..].to_vec(); + let buffers = &value.buffers()[1..]; Self { data_type: T::DATA_TYPE, views, - buffers, + buffers: buffers.into(), nulls: value.nulls().cloned(), phantom: Default::default(), } @@ -736,12 +745,18 @@ where } impl From> for ArrayData { - fn from(mut array: GenericByteViewArray) -> Self { + fn from(array: GenericByteViewArray) -> Self { let len = array.len(); - array.buffers.insert(0, array.views.into_inner()); + let new_buffers = { + let mut buffers = Vec::with_capacity(array.buffers.len() + 1); + buffers.push(array.views.into_inner()); + buffers.extend_from_slice(&array.buffers); + buffers + }; + let builder = ArrayDataBuilder::new(T::DATA_TYPE) .len(len) - .buffers(array.buffers) + .buffers(new_buffers) .nulls(array.nulls); unsafe { builder.build_unchecked() } diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs index 91696540d219..fd38e59900d6 100644 --- a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -259,6 +259,8 @@ pub mod temporal_conversions; pub mod timezone; mod trusted_len; pub mod types; +mod view_buffers; +pub use view_buffers::ViewBuffers; #[cfg(test)] mod tests { diff --git a/arrow-array/src/view_buffers.rs b/arrow-array/src/view_buffers.rs new file mode 100644 index 000000000000..98a3c62b782b --- /dev/null +++ b/arrow-array/src/view_buffers.rs @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{ops::Deref, sync::Arc}; + +use arrow_buffer::Buffer; + +/// A cheaply cloneable, owned slice of [`Buffer`] +/// +/// Similar to `Arc>` or `Arc<[Buffer]>` +#[derive(Clone, Debug)] +pub struct ViewBuffers(pub(crate) Arc<[Buffer]>); + +impl FromIterator for ViewBuffers { + fn from_iter>(iter: T) -> Self { + Self(iter.into_iter().collect()) + } +} + +impl From> for ViewBuffers { + fn from(value: Vec) -> Self { + Self(value.into()) + } +} + +impl From<&[Buffer]> for ViewBuffers { + fn from(value: &[Buffer]) -> Self { + Self(value.into()) + } +} + +impl Deref for ViewBuffers { + type Target = [Buffer]; + + fn deref(&self) -> &Self::Target { + self.0.as_ref() + } +} diff --git a/arrow/benches/concatenate_kernel.rs b/arrow/benches/concatenate_kernel.rs index a15c82001f67..508e1ee4b29c 100644 --- a/arrow/benches/concatenate_kernel.rs +++ b/arrow/benches/concatenate_kernel.rs @@ -39,18 +39,20 @@ fn bench_concat_arrays(arrays: &[&dyn Array]) { fn add_benchmark(c: &mut Criterion) { let v1 = create_primitive_array::(1024, 0.0); let v2 = create_primitive_array::(1024, 0.0); - c.bench_function("concat i32 1024", |b| b.iter(|| bench_concat(&v1, &v2))); + c.bench_function("concat i32 1024", |b| { + b.iter_with_large_drop(|| bench_concat(&v1, &v2)) + }); let v1 = create_primitive_array::(1024, 0.5); let v2 = create_primitive_array::(1024, 0.5); c.bench_function("concat i32 nulls 1024", |b| { - b.iter(|| bench_concat(&v1, &v2)) + b.iter_with_large_drop(|| bench_concat(&v1, &v2)) }); let small_array = create_primitive_array::(4, 0.0); let arrays: Vec<_> = (0..1024).map(|_| &small_array as &dyn Array).collect(); c.bench_function("concat 1024 arrays i32 4", |b| { - b.iter(|| bench_concat_arrays(&arrays)) + b.iter_with_large_drop(|| bench_concat_arrays(&arrays)) }); { @@ -59,7 +61,7 @@ fn add_benchmark(c: &mut Criterion) { .collect::>(); let arrays: Vec<_> = input.iter().map(|arr| arr as &dyn Array).collect(); c.bench_function("concat i32 8192 over 100 arrays", |b| { - b.iter(|| bench_concat_arrays(&arrays)) + b.iter_with_large_drop(|| bench_concat_arrays(&arrays)) }); } @@ -69,24 +71,26 @@ fn add_benchmark(c: &mut Criterion) { .collect::>(); let arrays: Vec<_> = input.iter().map(|arr| arr as &dyn Array).collect(); c.bench_function("concat i32 nulls 8192 over 100 arrays", |b| { - b.iter(|| bench_concat_arrays(&arrays)) + b.iter_with_large_drop(|| bench_concat_arrays(&arrays)) }); } let v1 = create_boolean_array(1024, 0.0, 0.5); let v2 = create_boolean_array(1024, 0.0, 0.5); - c.bench_function("concat boolean 1024", |b| b.iter(|| bench_concat(&v1, &v2))); + c.bench_function("concat boolean 1024", |b| { + b.iter_with_large_drop(|| bench_concat(&v1, &v2)) + }); let v1 = create_boolean_array(1024, 0.5, 0.5); let v2 = create_boolean_array(1024, 0.5, 0.5); c.bench_function("concat boolean nulls 1024", |b| { - b.iter(|| bench_concat(&v1, &v2)) + b.iter_with_large_drop(|| bench_concat(&v1, &v2)) }); let small_array = create_boolean_array(4, 0.0, 0.5); let arrays: Vec<_> = (0..1024).map(|_| &small_array as &dyn Array).collect(); c.bench_function("concat 1024 arrays boolean 4", |b| { - b.iter(|| bench_concat_arrays(&arrays)) + b.iter_with_large_drop(|| bench_concat_arrays(&arrays)) }); { @@ -95,7 +99,7 @@ fn add_benchmark(c: &mut Criterion) { .collect::>(); let arrays: Vec<_> = input.iter().map(|arr| arr as &dyn Array).collect(); c.bench_function("concat boolean 8192 over 100 arrays", |b| { - b.iter(|| bench_concat_arrays(&arrays)) + b.iter_with_large_drop(|| bench_concat_arrays(&arrays)) }); } @@ -105,24 +109,26 @@ fn add_benchmark(c: &mut Criterion) { .collect::>(); let arrays: Vec<_> = input.iter().map(|arr| arr as &dyn Array).collect(); c.bench_function("concat boolean nulls 8192 over 100 arrays", |b| { - b.iter(|| bench_concat_arrays(&arrays)) + b.iter_with_large_drop(|| bench_concat_arrays(&arrays)) }); } let v1 = create_string_array::(1024, 0.0); let v2 = create_string_array::(1024, 0.0); - c.bench_function("concat str 1024", |b| b.iter(|| bench_concat(&v1, &v2))); + c.bench_function("concat str 1024", |b| { + b.iter_with_large_drop(|| bench_concat(&v1, &v2)) + }); let v1 = create_string_array::(1024, 0.5); let v2 = create_string_array::(1024, 0.5); c.bench_function("concat str nulls 1024", |b| { - b.iter(|| bench_concat(&v1, &v2)) + b.iter_with_large_drop(|| bench_concat(&v1, &v2)) }); let small_array = create_string_array::(4, 0.0); let arrays: Vec<_> = (0..1024).map(|_| &small_array as &dyn Array).collect(); c.bench_function("concat 1024 arrays str 4", |b| { - b.iter(|| bench_concat_arrays(&arrays)) + b.iter_with_large_drop(|| bench_concat_arrays(&arrays)) }); { @@ -131,7 +137,7 @@ fn add_benchmark(c: &mut Criterion) { .collect::>(); let arrays: Vec<_> = input.iter().map(|arr| arr as &dyn Array).collect(); c.bench_function("concat str 8192 over 100 arrays", |b| { - b.iter(|| bench_concat_arrays(&arrays)) + b.iter_with_large_drop(|| bench_concat_arrays(&arrays)) }); } @@ -141,7 +147,7 @@ fn add_benchmark(c: &mut Criterion) { .collect::>(); let arrays: Vec<_> = input.iter().map(|arr| arr as &dyn Array).collect(); c.bench_function("concat str nulls 8192 over 100 arrays", |b| { - b.iter(|| bench_concat_arrays(&arrays)) + b.iter_with_large_drop(|| bench_concat_arrays(&arrays)) }); } @@ -155,7 +161,9 @@ fn add_benchmark(c: &mut Criterion) { let id = format!( "concat utf8_view {name} max_str_len={str_len} null_density={null_density}" ); - c.bench_function(&id, |b| b.iter(|| bench_concat_arrays(&arrays))); + c.bench_function(&id, |b| { + b.iter_with_large_drop(|| bench_concat_arrays(&arrays)) + }); } } @@ -164,7 +172,7 @@ fn add_benchmark(c: &mut Criterion) { let v2 = create_string_array_with_len::(10, 0.0, 20); let v2 = create_dict_from_values::(1024, 0.0, &v2); c.bench_function("concat str_dict 1024", |b| { - b.iter(|| bench_concat(&v1, &v2)) + b.iter_with_large_drop(|| bench_concat(&v1, &v2)) }); let v1 = create_string_array_with_len::(1024, 0.0, 20); @@ -172,7 +180,7 @@ fn add_benchmark(c: &mut Criterion) { let v2 = create_string_array_with_len::(1024, 0.0, 20); let v2 = create_sparse_dict_from_values::(1024, 0.0, &v2, 30..40); c.bench_function("concat str_dict_sparse 1024", |b| { - b.iter(|| bench_concat(&v1, &v2)) + b.iter_with_large_drop(|| bench_concat(&v1, &v2)) }); let v1 = FixedSizeListArray::try_new( @@ -190,7 +198,7 @@ fn add_benchmark(c: &mut Criterion) { ) .unwrap(); c.bench_function("concat fixed size lists", |b| { - b.iter(|| bench_concat(&v1, &v2)) + b.iter_with_large_drop(|| bench_concat(&v1, &v2)) }); { @@ -233,7 +241,7 @@ fn add_benchmark(c: &mut Criterion) { c.bench_function( &format!("concat struct with int32 and dicts size={batch_size} count={batch_count}"), - |b| b.iter(|| bench_concat_arrays(&array_refs)), + |b| b.iter_with_large_drop(|| bench_concat_arrays(&array_refs)), ); } }