diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index f2b22507081d..3db073eb00f0 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -69,7 +69,7 @@ use std::sync::Arc; /// /// ``` /// # use arrow_array::Array; -/// # use arrow_array::builder::GenericByteBuilder; +/// # use arrow_array::builder::{GenericByteBuilder, ValuesBuilder}; /// # use arrow_array::types::Utf8Type; /// let mut builder = GenericByteBuilder::::new(); /// builder.append_value("hello"); diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 9d2d396a5266..879ec30d0ba7 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -16,7 +16,7 @@ // under the License. use crate::array::print_long_array; -use crate::builder::{ArrayBuilder, GenericByteViewBuilder}; +use crate::builder::{ArrayBuilder, GenericByteViewBuilder, ValuesBuilder}; use crate::iterator::ArrayIter; use crate::types::bytes::ByteArrayNativeType; use crate::types::{BinaryViewType, ByteViewType, StringViewType}; @@ -841,7 +841,7 @@ impl From>> for StringViewArray { #[cfg(test)] mod tests { - use crate::builder::{BinaryViewBuilder, StringViewBuilder}; + use crate::builder::{BinaryViewBuilder, StringViewBuilder, ValuesBuilder}; use crate::{Array, BinaryViewArray, StringViewArray}; use arrow_buffer::{Buffer, ScalarBuffer}; use arrow_data::ByteView; diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index b340bf9a9065..79dbd814b387 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -23,7 +23,7 @@ use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field}; use crate::{ - builder::StringRunBuilder, + builder::{StringRunBuilder, ValuesBuilder}, make_array, run_iterator::RunArrayIter, types::{Int16Type, Int32Type, Int64Type, RunEndIndexType}, diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index ed70e5744fff..597b02266d6f 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -156,7 +156,7 @@ pub type LargeStringArray = GenericStringArray; #[cfg(test)] mod tests { use super::*; - use crate::builder::{ListBuilder, PrimitiveBuilder, StringBuilder}; + use crate::builder::{ListBuilder, PrimitiveBuilder, StringBuilder, ValuesBuilder}; use crate::types::UInt8Type; use crate::Array; use arrow_buffer::Buffer; diff --git a/arrow-array/src/builder/generic_byte_run_builder.rs b/arrow-array/src/builder/generic_byte_run_builder.rs index 0bf5658b297e..358130dcead8 100644 --- a/arrow-array/src/builder/generic_byte_run_builder.rs +++ b/arrow-array/src/builder/generic_byte_run_builder.rs @@ -23,7 +23,7 @@ use crate::{ ArrayRef, ArrowPrimitiveType, RunArray, }; -use super::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder}; +use super::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder, ValuesBuilder}; use arrow_buffer::ArrowNativeType; @@ -33,7 +33,7 @@ use arrow_buffer::ArrowNativeType; /// /// ``` /// -/// # use arrow_array::builder::GenericByteRunBuilder; +/// # use arrow_array::builder::{GenericByteRunBuilder, ValuesBuilder}; /// # use arrow_array::{GenericByteArray, BinaryArray}; /// # use arrow_array::types::{BinaryType, Int16Type}; /// # use arrow_array::{Array, Int16Array}; @@ -155,13 +155,11 @@ where } } -impl GenericByteRunBuilder -where - R: RunEndIndexType, - V: ByteArrayType, -{ +impl ValuesBuilder for GenericByteRunBuilder { + type Value = T::Native; + /// Appends optional value to the logical array encoded by the RunArray. - pub fn append_option(&mut self, input_value: Option>) { + fn append_option(&mut self, input_value: Option>) { match input_value { Some(value) => self.append_value(value), None => self.append_null(), @@ -169,7 +167,7 @@ where } /// Appends value to the logical array encoded by the RunArray. - pub fn append_value(&mut self, input_value: impl AsRef) { + fn append_value(&mut self, input_value: impl AsRef) { let value: &[u8] = input_value.as_ref().as_ref(); if !self.has_current_value { self.append_run_end(); @@ -184,7 +182,7 @@ where } /// Appends null to the logical array encoded by the RunArray. - pub fn append_null(&mut self) { + fn append_null(&mut self) { if self.has_current_value { self.append_run_end(); self.current_value.clear(); @@ -192,7 +190,13 @@ where } self.current_run_end_index += 1; } +} +impl GenericByteRunBuilder +where + R: RunEndIndexType, + V: ByteArrayType, +{ /// Creates the RunArray and resets the builder. /// Panics if RunArray cannot be built. pub fn finish(&mut self) -> RunArray { @@ -305,7 +309,7 @@ where /// // Create a run-end encoded array with run-end indexes data type as `i16`. /// // The encoded values are Strings. /// -/// # use arrow_array::builder::StringRunBuilder; +/// # use arrow_array::builder::{StringRunBuilder, ValuesBuilder}; /// # use arrow_array::{Int16Array, StringArray}; /// # use arrow_array::types::Int16Type; /// # use arrow_array::cast::AsArray; @@ -341,7 +345,7 @@ pub type LargeStringRunBuilder = GenericByteRunBuilder; /// // Create a run-end encoded array with run-end indexes data type as `i16`. /// // The encoded data is binary values. /// -/// # use arrow_array::builder::BinaryRunBuilder; +/// # use arrow_array::builder::{BinaryRunBuilder, ValuesBuilder}; /// # use arrow_array::{BinaryArray, Int16Array}; /// # use arrow_array::cast::AsArray; /// # use arrow_array::types::Int16Type; diff --git a/arrow-array/src/builder/generic_bytes_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs index e2be96615b61..a090f8d25d63 100644 --- a/arrow-array/src/builder/generic_bytes_builder.rs +++ b/arrow-array/src/builder/generic_bytes_builder.rs @@ -24,6 +24,8 @@ use arrow_data::ArrayDataBuilder; use std::any::Any; use std::sync::Arc; +use super::ValuesBuilder; + /// Builder for [`GenericByteArray`] /// /// For building strings, see docs on [`GenericStringBuilder`]. @@ -86,49 +88,6 @@ impl GenericByteBuilder { T::Offset::from_usize(self.value_builder.len()).expect("byte array offset overflow") } - /// Appends a value into the builder. - /// - /// See the [GenericStringBuilder] documentation for examples of - /// incrementally building string values with multiple `write!` calls. - /// - /// # Panics - /// - /// Panics if the resulting length of [`Self::values_slice`] would exceed - /// `T::Offset::MAX` bytes. - /// - /// For example, this can happen with [`StringArray`] or [`BinaryArray`] - /// where the total length of all values exceeds 2GB - /// - /// [`StringArray`]: crate::StringArray - /// [`BinaryArray`]: crate::BinaryArray - #[inline] - pub fn append_value(&mut self, value: impl AsRef) { - self.value_builder.append_slice(value.as_ref().as_ref()); - self.null_buffer_builder.append(true); - self.offsets_builder.append(self.next_offset()); - } - - /// Append an `Option` value into the builder. - /// - /// - A `None` value will append a null value. - /// - A `Some` value will append the value. - /// - /// See [`Self::append_value`] for more panic information. - #[inline] - pub fn append_option(&mut self, value: Option>) { - match value { - None => self.append_null(), - Some(v) => self.append_value(v), - }; - } - - /// Append a null value into the builder. - #[inline] - pub fn append_null(&mut self) { - self.null_buffer_builder.append(false); - self.offsets_builder.append(self.next_offset()); - } - /// Builds the [`GenericByteArray`] and reset this builder. pub fn finish(&mut self) -> GenericByteArray { let array_type = T::DATA_TYPE; @@ -228,6 +187,53 @@ impl ArrayBuilder for GenericByteBuilder { } } +impl ValuesBuilder for GenericByteBuilder { + type Value = T::Native; + + /// Appends a value into the builder. + /// + /// See the [GenericStringBuilder] documentation for examples of + /// incrementally building string values with multiple `write!` calls. + /// + /// # Panics + /// + /// Panics if the resulting length of [`Self::values_slice`] would exceed + /// `T::Offset::MAX` bytes. + /// + /// For example, this can happen with [`StringArray`] or [`BinaryArray`] + /// where the total length of all values exceeds 2GB + /// + /// [`StringArray`]: crate::StringArray + /// [`BinaryArray`]: crate::BinaryArray + #[inline] + fn append_value(&mut self, value: impl AsRef) { + self.value_builder.append_slice(value.as_ref().as_ref()); + self.null_buffer_builder.append(true); + self.offsets_builder.append(self.next_offset()); + } + + /// Append an `Option` value into the builder. + /// + /// - A `None` value will append a null value. + /// - A `Some` value will append the value. + /// + /// See [`Self::append_value`] for more panic information. + #[inline] + fn append_option(&mut self, value: Option>) { + match value { + None => self.append_null(), + Some(v) => self.append_value(v), + }; + } + + /// Append a null value into the builder. + #[inline] + fn append_null(&mut self) { + self.null_buffer_builder.append(false); + self.offsets_builder.append(self.next_offset()); + } +} + impl> Extend> for GenericByteBuilder { #[inline] fn extend>>(&mut self, iter: I) { @@ -249,7 +255,7 @@ impl> Extend> for GenericByteBui /// /// # Example writing strings with `append_value` /// ``` -/// # use arrow_array::builder::GenericStringBuilder; +/// # use arrow_array::builder::{GenericStringBuilder, ValuesBuilder}; /// let mut builder = GenericStringBuilder::::new(); /// /// // Write one string value @@ -267,7 +273,7 @@ impl> Extend> for GenericByteBui /// /// ``` /// # use std::fmt::Write; -/// # use arrow_array::builder::GenericStringBuilder; +/// # use arrow_array::builder::{GenericStringBuilder, ValuesBuilder}; /// let mut builder = GenericStringBuilder::::new(); /// /// // Write data in multiple `write!` calls @@ -302,7 +308,7 @@ impl std::fmt::Write for GenericStringBuilder { /// /// # Example /// ``` -/// # use arrow_array::builder::GenericBinaryBuilder; +/// # use arrow_array::builder::{GenericBinaryBuilder, ValuesBuilder}; /// let mut builder = GenericBinaryBuilder::::new(); /// /// // Write data @@ -321,7 +327,7 @@ impl std::fmt::Write for GenericStringBuilder { /// /// ``` /// # use std::io::Write; -/// # use arrow_array::builder::GenericBinaryBuilder; +/// # use arrow_array::builder::{GenericBinaryBuilder, ValuesBuilder}; /// let mut builder = GenericBinaryBuilder::::new(); /// /// // Write data in multiple `write_bytes` calls diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs index ead151d5ceea..12136a13d2e0 100644 --- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs +++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::builder::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder}; +use crate::builder::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder, ValuesBuilder}; use crate::types::{ArrowDictionaryKeyType, ByteArrayType, GenericBinaryType, GenericStringType}; use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray, TypedDictionaryArray}; use arrow_buffer::ArrowNativeType; diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs index 7268e751b149..db8de3da5149 100644 --- a/arrow-array/src/builder/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder/generic_bytes_view_builder.rs @@ -25,7 +25,7 @@ use arrow_schema::ArrowError; use hashbrown::hash_table::Entry; use hashbrown::HashTable; -use crate::builder::ArrayBuilder; +use crate::builder::{ArrayBuilder, ValuesBuilder}; use crate::types::bytes::ByteArrayNativeType; use crate::types::{BinaryViewType, ByteViewType, StringViewType}; use crate::{ArrayRef, GenericByteViewArray}; @@ -272,6 +272,56 @@ impl GenericByteViewBuilder { } } + /// Builds the [`GenericByteViewArray`] and reset this builder + pub fn finish(&mut self) -> GenericByteViewArray { + self.flush_in_progress(); + let completed = std::mem::take(&mut self.completed); + let len = self.views_builder.len(); + let views = ScalarBuffer::new(self.views_builder.finish(), 0, len); + let nulls = self.null_buffer_builder.finish(); + if let Some((ref mut ht, _)) = self.string_tracker.as_mut() { + ht.clear(); + } + // SAFETY: valid by construction + unsafe { GenericByteViewArray::new_unchecked(views, completed, nulls) } + } + + /// Builds the [`GenericByteViewArray`] without resetting the builder + pub fn finish_cloned(&self) -> GenericByteViewArray { + let mut completed = self.completed.clone(); + if !self.in_progress.is_empty() { + completed.push(Buffer::from_slice_ref(&self.in_progress)); + } + let len = self.views_builder.len(); + let views = Buffer::from_slice_ref(self.views_builder.as_slice()); + let views = ScalarBuffer::new(views, 0, len); + let nulls = self.null_buffer_builder.finish_cloned(); + // SAFETY: valid by construction + unsafe { GenericByteViewArray::new_unchecked(views, completed, nulls) } + } + + /// Returns the current null buffer as a slice + pub fn validity_slice(&self) -> Option<&[u8]> { + self.null_buffer_builder.as_slice() + } + + /// Return the allocated size of this builder in bytes, useful for memory accounting. + pub fn allocated_size(&self) -> usize { + let views = self.views_builder.capacity() * std::mem::size_of::(); + let null = self.null_buffer_builder.allocated_size(); + let buffer_size = self.completed.iter().map(|b| b.capacity()).sum::(); + let in_progress = self.in_progress.capacity(); + let tracker = match &self.string_tracker { + Some((ht, _)) => ht.capacity() * std::mem::size_of::(), + None => 0, + }; + buffer_size + in_progress + tracker + views + null + } +} + +impl ValuesBuilder for GenericByteViewBuilder { + type Value = T::Native; + /// Appends a value into the builder /// /// # Panics @@ -280,7 +330,7 @@ impl GenericByteViewBuilder { /// - String buffer count exceeds `u32::MAX` /// - String length exceeds `u32::MAX` #[inline] - pub fn append_value(&mut self, value: impl AsRef) { + fn append_value(&mut self, value: impl AsRef) { let v: &[u8] = value.as_ref().as_ref(); let length: u32 = v.len().try_into().unwrap(); if length <= 12 { @@ -347,7 +397,7 @@ impl GenericByteViewBuilder { /// Append an `Option` value into the builder #[inline] - pub fn append_option(&mut self, value: Option>) { + fn append_option(&mut self, value: Option>) { match value { None => self.append_null(), Some(v) => self.append_value(v), @@ -356,56 +406,10 @@ impl GenericByteViewBuilder { /// Append a null value into the builder #[inline] - pub fn append_null(&mut self) { + fn append_null(&mut self) { self.null_buffer_builder.append_null(); self.views_builder.append(0); } - - /// Builds the [`GenericByteViewArray`] and reset this builder - pub fn finish(&mut self) -> GenericByteViewArray { - self.flush_in_progress(); - let completed = std::mem::take(&mut self.completed); - let len = self.views_builder.len(); - let views = ScalarBuffer::new(self.views_builder.finish(), 0, len); - let nulls = self.null_buffer_builder.finish(); - if let Some((ref mut ht, _)) = self.string_tracker.as_mut() { - ht.clear(); - } - // SAFETY: valid by construction - unsafe { GenericByteViewArray::new_unchecked(views, completed, nulls) } - } - - /// Builds the [`GenericByteViewArray`] without resetting the builder - pub fn finish_cloned(&self) -> GenericByteViewArray { - let mut completed = self.completed.clone(); - if !self.in_progress.is_empty() { - completed.push(Buffer::from_slice_ref(&self.in_progress)); - } - let len = self.views_builder.len(); - let views = Buffer::from_slice_ref(self.views_builder.as_slice()); - let views = ScalarBuffer::new(views, 0, len); - let nulls = self.null_buffer_builder.finish_cloned(); - // SAFETY: valid by construction - unsafe { GenericByteViewArray::new_unchecked(views, completed, nulls) } - } - - /// Returns the current null buffer as a slice - pub fn validity_slice(&self) -> Option<&[u8]> { - self.null_buffer_builder.as_slice() - } - - /// Return the allocated size of this builder in bytes, useful for memory accounting. - pub fn allocated_size(&self) -> usize { - let views = self.views_builder.capacity() * std::mem::size_of::(); - let null = self.null_buffer_builder.allocated_size(); - let buffer_size = self.completed.iter().map(|b| b.capacity()).sum::(); - let in_progress = self.in_progress.capacity(); - let tracker = match &self.string_tracker { - Some((ht, _)) => ht.capacity() * std::mem::size_of::(), - None => 0, - }; - buffer_size + in_progress + tracker + views + null - } } impl Default for GenericByteViewBuilder { @@ -470,7 +474,7 @@ impl> Extend> /// /// # Example /// ``` -/// # use arrow_array::builder::StringViewBuilder; +/// # use arrow_array::builder::{StringViewBuilder, ValuesBuilder}; /// # use arrow_array::StringViewArray; /// let mut builder = StringViewBuilder::new(); /// builder.append_value("hello"); @@ -491,7 +495,7 @@ pub type StringViewBuilder = GenericByteViewBuilder; /// /// # Example /// ``` -/// # use arrow_array::builder::BinaryViewBuilder; +/// # use arrow_array::builder::{BinaryViewBuilder, ValuesBuilder}; /// use arrow_array::BinaryViewArray; /// let mut builder = BinaryViewBuilder::new(); /// builder.append_value("hello"); diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index a9c88ec6c586..c32624d46219 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -34,7 +34,7 @@ use std::sync::Arc; /// /// ``` /// # use std::sync::Arc; -/// # use arrow_array::{builder::ListBuilder, builder::StringBuilder, ArrayRef, StringArray, Array}; +/// # use arrow_array::{builder::{ListBuilder, StringBuilder, ValuesBuilder}, ArrayRef, StringArray, Array}; /// # /// let values_builder = StringBuilder::new(); /// let mut builder = ListBuilder::new(values_builder); diff --git a/arrow-array/src/builder/map_builder.rs b/arrow-array/src/builder/map_builder.rs index 1d89d427aae1..17f826c12ac1 100644 --- a/arrow-array/src/builder/map_builder.rs +++ b/arrow-array/src/builder/map_builder.rs @@ -27,7 +27,7 @@ use std::sync::Arc; /// Builder for [`MapArray`] /// /// ``` -/// # use arrow_array::builder::{Int32Builder, MapBuilder, StringBuilder}; +/// # use arrow_array::builder::{Int32Builder, MapBuilder, StringBuilder, ValuesBuilder}; /// # use arrow_array::{Int32Array, StringArray}; /// /// let string_builder = StringBuilder::new(); @@ -262,7 +262,7 @@ impl ArrayBuilder for MapBuilder { #[cfg(test)] mod tests { - use crate::builder::{make_builder, Int32Builder, StringBuilder}; + use crate::builder::{make_builder, Int32Builder, StringBuilder, ValuesBuilder}; use crate::{Int32Array, StringArray}; use super::*; diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index 89a96280eb87..e25041d2c3c8 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -22,7 +22,7 @@ //! Builders can be used to build simple, non-nested arrays //! //! ``` -//! # use arrow_array::builder::Int32Builder; +//! # use arrow_array::builder::{Int32Builder, ValuesBuilder}; //! # use arrow_array::PrimitiveArray; //! let mut a = Int32Builder::new(); //! a.append_value(1); @@ -34,7 +34,7 @@ //! ``` //! //! ``` -//! # use arrow_array::builder::StringBuilder; +//! # use arrow_array::builder::{StringBuilder, ValuesBuilder}; //! # use arrow_array::{Array, StringArray}; //! let mut a = StringBuilder::new(); //! a.append_value("foo"); @@ -50,7 +50,7 @@ //! Builders can also be used to build more complex nested arrays, such as lists //! //! ``` -//! # use arrow_array::builder::{Int32Builder, ListBuilder}; +//! # use arrow_array::builder::{Int32Builder, ListBuilder, ValuesBuilder}; //! # use arrow_array::ListArray; //! # use arrow_array::types::Int32Type; //! let mut a = ListBuilder::new(Int32Builder::new()); @@ -87,7 +87,7 @@ //! //! ``` //! # use std::any::Any; -//! # use arrow_array::builder::{ArrayBuilder, Int32Builder, ListBuilder, StringBuilder}; +//! # use arrow_array::builder::{ArrayBuilder, Int32Builder, ListBuilder, StringBuilder, ValuesBuilder}; //! # use arrow_array::{ArrayRef, RecordBatch, StructArray}; //! # use arrow_schema::{DataType, Field}; //! # use std::sync::Arc; @@ -194,7 +194,7 @@ use std::any::Any; /// ``` /// // Create /// # use arrow_array::{ArrayRef, StringArray}; -/// # use arrow_array::builder::{ArrayBuilder, Float64Builder, Int64Builder, StringBuilder}; +/// # use arrow_array::builder::{ArrayBuilder, Float64Builder, Int64Builder, StringBuilder, ValuesBuilder}; /// /// let mut data_builders: Vec> = vec![ /// Box::new(Float64Builder::new()), @@ -298,6 +298,22 @@ impl ArrayBuilder for Box { } } +/// Trait for common interfaces used by [`ArrayBuilder`]s +/// when building a list of values. +pub trait ValuesBuilder: ArrayBuilder + Send + Sync { + /// Type of value. + type Value: ?Sized; + + /// Append a null value. + fn append_null(&mut self); + + /// Append a non-null value. + fn append_value(&mut self, value: impl AsRef); + + /// Append a value, which may be null. + fn append_option(&mut self, value: Option>); +} + /// Builder for [`ListArray`](crate::array::ListArray) pub type ListBuilder = GenericListBuilder; diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 2b288445c74b..0dfc3c0047c6 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -40,7 +40,7 @@ use std::sync::Arc; /// For a practical example see the code below: /// /// ```rust -/// use arrow_array::builder::{ArrayBuilder, ListBuilder, StringBuilder, StructBuilder}; +/// use arrow_array::builder::{ArrayBuilder, ListBuilder, StringBuilder, StructBuilder, ValuesBuilder}; /// use arrow_schema::{DataType, Field, Fields}; /// use std::sync::Arc; /// diff --git a/arrow-array/src/ffi.rs b/arrow-array/src/ffi.rs index 144f2a21afec..b6980857e701 100644 --- a/arrow-array/src/ffi.rs +++ b/arrow-array/src/ffi.rs @@ -1304,7 +1304,7 @@ mod tests_from_ffi { use arrow_schema::{DataType, Field}; use super::{ImportedArrowArray, Result}; - use crate::builder::GenericByteViewBuilder; + use crate::builder::{GenericByteViewBuilder, ValuesBuilder}; use crate::types::{BinaryViewType, ByteViewType, Int32Type, StringViewType}; use crate::{ array::{ diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 669b8a664c2b..88b286a7e15d 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -1056,7 +1056,7 @@ pub fn lexical_to_string(n: N) -> String { #[cfg(test)] mod tests { - use arrow_array::builder::StringRunBuilder; + use arrow_array::builder::{StringRunBuilder, ValuesBuilder}; use super::*; diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index c5a0a0b76d59..6ec450f31be1 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -419,7 +419,7 @@ mod tests { use crate::ReaderBuilder; use arrow_array::builder::{ BinaryBuilder, Decimal128Builder, Decimal256Builder, FixedSizeBinaryBuilder, - LargeBinaryBuilder, + LargeBinaryBuilder, ValuesBuilder, }; use arrow_array::types::*; use arrow_buffer::i256; diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index 657298b4a8b3..d0565dd00b99 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -32,7 +32,7 @@ use tonic::transport::Server; use tonic::transport::{Certificate, Identity, ServerTlsConfig}; use tonic::{Request, Response, Status, Streaming}; -use arrow_array::builder::StringBuilder; +use arrow_array::builder::{StringBuilder, ValuesBuilder}; use arrow_array::{ArrayRef, RecordBatch}; use arrow_flight::encode::FlightDataEncoderBuilder; use arrow_flight::sql::metadata::{ diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index 315b7b3cb6e5..fb181ae1c579 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -734,6 +734,7 @@ mod tests { use crate::decode::{DecodedPayload, FlightDataDecoder}; use arrow_array::builder::{ GenericByteDictionaryBuilder, ListBuilder, StringDictionaryBuilder, StructBuilder, + ValuesBuilder, }; use arrow_array::*; use arrow_array::{cast::downcast_array, types::*}; diff --git a/arrow-flight/src/sql/metadata/db_schemas.rs b/arrow-flight/src/sql/metadata/db_schemas.rs index 303d11cd74ca..bd383b9eb72b 100644 --- a/arrow-flight/src/sql/metadata/db_schemas.rs +++ b/arrow-flight/src/sql/metadata/db_schemas.rs @@ -22,7 +22,10 @@ use std::sync::Arc; use arrow_arith::boolean::and; -use arrow_array::{builder::StringBuilder, ArrayRef, RecordBatch, StringArray}; +use arrow_array::{ + builder::{StringBuilder, ValuesBuilder}, + ArrayRef, RecordBatch, StringArray, +}; use arrow_ord::cmp::eq; use arrow_schema::{DataType, Field, Schema, SchemaRef}; use arrow_select::{filter::filter_record_batch, take::take}; diff --git a/arrow-flight/src/sql/metadata/sql_info.rs b/arrow-flight/src/sql/metadata/sql_info.rs index 58b228530942..92e54f232edb 100644 --- a/arrow-flight/src/sql/metadata/sql_info.rs +++ b/arrow-flight/src/sql/metadata/sql_info.rs @@ -31,7 +31,7 @@ use arrow_arith::boolean::or; use arrow_array::array::{Array, UInt32Array, UnionArray}; use arrow_array::builder::{ ArrayBuilder, BooleanBuilder, Int32Builder, Int64Builder, Int8Builder, ListBuilder, MapBuilder, - StringBuilder, UInt32Builder, + StringBuilder, UInt32Builder, ValuesBuilder, }; use arrow_array::{RecordBatch, Scalar}; use arrow_data::ArrayData; diff --git a/arrow-flight/src/sql/metadata/table_types.rs b/arrow-flight/src/sql/metadata/table_types.rs index 54cfe6fe27a7..448971fe7ad2 100644 --- a/arrow-flight/src/sql/metadata/table_types.rs +++ b/arrow-flight/src/sql/metadata/table_types.rs @@ -21,7 +21,10 @@ use std::sync::Arc; -use arrow_array::{builder::StringBuilder, ArrayRef, RecordBatch}; +use arrow_array::{ + builder::{StringBuilder, ValuesBuilder}, + ArrayRef, RecordBatch, +}; use arrow_schema::{DataType, Field, Schema, SchemaRef}; use arrow_select::take::take; use once_cell::sync::Lazy; diff --git a/arrow-flight/src/sql/metadata/tables.rs b/arrow-flight/src/sql/metadata/tables.rs index 7ffb76fa1d5f..a9b574ad169a 100644 --- a/arrow-flight/src/sql/metadata/tables.rs +++ b/arrow-flight/src/sql/metadata/tables.rs @@ -22,7 +22,7 @@ use std::sync::Arc; use arrow_arith::boolean::{and, or}; -use arrow_array::builder::{BinaryBuilder, StringBuilder}; +use arrow_array::builder::{BinaryBuilder, StringBuilder, ValuesBuilder}; use arrow_array::{ArrayRef, RecordBatch, StringArray}; use arrow_ord::cmp::eq; use arrow_schema::{DataType, Field, Schema, SchemaRef}; diff --git a/arrow-flight/src/sql/metadata/xdbc_info.rs b/arrow-flight/src/sql/metadata/xdbc_info.rs index a3a18ca10888..4001b3d711c7 100644 --- a/arrow-flight/src/sql/metadata/xdbc_info.rs +++ b/arrow-flight/src/sql/metadata/xdbc_info.rs @@ -26,7 +26,9 @@ //! use std::sync::Arc; -use arrow_array::builder::{BooleanBuilder, Int32Builder, ListBuilder, StringBuilder}; +use arrow_array::builder::{ + BooleanBuilder, Int32Builder, ListBuilder, StringBuilder, ValuesBuilder, +}; use arrow_array::{ArrayRef, Int32Array, ListArray, RecordBatch, Scalar}; use arrow_ord::cmp::eq; use arrow_schema::{DataType, Field, Schema, SchemaRef}; diff --git a/arrow-json/src/reader/string_array.rs b/arrow-json/src/reader/string_array.rs index 5ab4d09d5d63..44c54a1a9647 100644 --- a/arrow-json/src/reader/string_array.rs +++ b/arrow-json/src/reader/string_array.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use arrow_array::builder::GenericStringBuilder; +use arrow_array::builder::{GenericStringBuilder, ValuesBuilder}; use arrow_array::{Array, GenericStringArray, OffsetSizeTrait}; use arrow_data::ArrayData; use arrow_schema::ArrowError; diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index bb82f54d4918..926b6664812e 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -117,6 +117,7 @@ mod tests { use arrow_array::builder::{ ListBuilder, PrimitiveDictionaryBuilder, StringBuilder, StringDictionaryBuilder, + ValuesBuilder, }; use arrow_array::types::*; use arrow_buffer::{i256, ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano}; diff --git a/arrow-string/src/regexp.rs b/arrow-string/src/regexp.rs index f3893cd5bd13..8bd6e80c9059 100644 --- a/arrow-string/src/regexp.rs +++ b/arrow-string/src/regexp.rs @@ -21,13 +21,15 @@ use crate::like::StringArrayType; use arrow_array::builder::{ - BooleanBufferBuilder, GenericStringBuilder, ListBuilder, StringViewBuilder, + BooleanBufferBuilder, GenericStringBuilder, ListBuilder, StringViewBuilder, ValuesBuilder, }; use arrow_array::cast::AsArray; use arrow_array::*; use arrow_buffer::NullBuffer; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field}; +use builder::GenericListBuilder; +use iterator::ArrayIter; use regex::Regex; use std::collections::HashMap; @@ -245,71 +247,75 @@ where Ok(BooleanArray::from(data)) } -macro_rules! process_regexp_array_match { - ($array:expr, $regex_array:expr, $flags_array:expr, $list_builder:expr) => { - let mut patterns: HashMap = HashMap::new(); +fn process_regexp_array_match< + 'a, + T: ?Sized, + A: ArrayAccessor, + O: OffsetSizeTrait, + B: ValuesBuilder, +>( + array: ArrayIter, + regex_array: ArrayIter, + flags_array: Option>, + list_builder: &mut GenericListBuilder, +) -> Result<(), ArrowError> { + let mut patterns: HashMap = HashMap::new(); - let complete_pattern = match $flags_array { - Some(flags) => Box::new($regex_array.iter().zip(flags.iter()).map( - |(pattern, flags)| { - pattern.map(|pattern| match flags { - Some(value) => format!("(?{value}){pattern}"), - None => pattern.to_string(), - }) - }, - )) as Box>>, - None => Box::new( - $regex_array - .iter() - .map(|pattern| pattern.map(|pattern| pattern.to_string())), - ), - }; + let complete_pattern = match flags_array { + Some(flags) => Box::new(regex_array.zip(flags).map(|(pattern, flags)| { + pattern.map(|pattern| match flags { + Some(value) => format!("(?{value}){pattern}"), + None => pattern.to_string(), + }) + })) as Box>>, + None => Box::new(regex_array.map(|pattern| pattern.map(|pattern| pattern.to_string()))), + }; - $array - .iter() - .zip(complete_pattern) - .map(|(value, pattern)| { - match (value, pattern) { - // Required for Postgres compatibility: - // SELECT regexp_match('foobarbequebaz', ''); = {""} - (Some(_), Some(pattern)) if pattern == *"" => { - $list_builder.values().append_value(""); - $list_builder.append(true); - } - (Some(value), Some(pattern)) => { - let existing_pattern = patterns.get(&pattern); - let re = match existing_pattern { - Some(re) => re, - None => { - let re = Regex::new(pattern.as_str()).map_err(|e| { - ArrowError::ComputeError(format!( - "Regular expression did not compile: {e:?}" - )) - })?; - patterns.entry(pattern).or_insert(re) + array + .zip(complete_pattern) + .map(|(value, pattern)| { + match (value, pattern) { + // Required for Postgres compatibility: + // SELECT regexp_match('foobarbequebaz', ''); = {""} + (Some(_), Some(pattern)) if pattern == *"" => { + list_builder.values().append_value(""); + list_builder.append(true); + } + (Some(value), Some(pattern)) => { + let existing_pattern = patterns.get(&pattern); + let re = match existing_pattern { + Some(re) => re, + None => { + let re = Regex::new(pattern.as_str()).map_err(|e| { + ArrowError::ComputeError(format!( + "Regular expression did not compile: {e:?}" + )) + })?; + patterns.entry(pattern).or_insert(re) + } + }; + match re.captures(value) { + Some(caps) => { + let mut iter = caps.iter(); + if caps.len() > 1 { + iter.next(); } - }; - match re.captures(value) { - Some(caps) => { - let mut iter = caps.iter(); - if caps.len() > 1 { - iter.next(); - } - for m in iter.flatten() { - $list_builder.values().append_value(m.as_str()); - } - - $list_builder.append(true); + for m in iter.flatten() { + list_builder.values().append_value(m.as_str()); } - None => $list_builder.append(false), + + list_builder.append(true); } + None => list_builder.append(false), } - _ => $list_builder.append(false), } - Ok(()) - }) - .collect::, ArrowError>>()?; - }; + _ => list_builder.append(false), + } + Ok(()) + }) + .collect::, ArrowError>>()?; + + Ok(()) } fn regexp_array_match( @@ -320,7 +326,12 @@ fn regexp_array_match( let builder: GenericStringBuilder = GenericStringBuilder::with_capacity(0, 0); let mut list_builder = ListBuilder::new(builder); - process_regexp_array_match!(array, regex_array, flags_array, list_builder); + process_regexp_array_match( + array.iter(), + regex_array.iter(), + flags_array.map(|fa| fa.iter()), + &mut list_builder, + )?; Ok(Arc::new(list_builder.finish())) } @@ -333,7 +344,12 @@ fn regexp_array_match_utf8view( let builder = StringViewBuilder::with_capacity(0); let mut list_builder = ListBuilder::new(builder); - process_regexp_array_match!(array, regex_array, flags_array, list_builder); + process_regexp_array_match( + array.iter(), + regex_array.iter(), + flags_array.map(|fa| fa.iter()), + &mut list_builder, + )?; Ok(Arc::new(list_builder.finish())) } @@ -368,37 +384,46 @@ fn get_scalar_pattern_flag_utf8view<'a>( } } -macro_rules! process_regexp_match { - ($array:expr, $regex:expr, $list_builder:expr) => { - $array - .iter() - .map(|value| { - match value { - // Required for Postgres compatibility: - // SELECT regexp_match('foobarbequebaz', ''); = {""} - Some(_) if $regex.as_str().is_empty() => { - $list_builder.values().append_value(""); - $list_builder.append(true); - } - Some(value) => match $regex.captures(value) { - Some(caps) => { - let mut iter = caps.iter(); - if caps.len() > 1 { - iter.next(); - } - for m in iter.flatten() { - $list_builder.values().append_value(m.as_str()); - } - $list_builder.append(true); - } - None => $list_builder.append(false), - }, - None => $list_builder.append(false), +fn process_regexp_match< + 'a, + T: ?Sized, + A: ArrayAccessor, + O: OffsetSizeTrait, + B: ValuesBuilder, +>( + array: ArrayIter, + regex: &Regex, + list_builder: &mut GenericListBuilder, +) -> Result<(), ArrowError> { + array + .map(|value| { + match value { + // Required for Postgres compatibility: + // SELECT regexp_match('foobarbequebaz', ''); = {""} + Some(_) if regex.as_str().is_empty() => { + list_builder.values().append_value(""); + list_builder.append(true); } - Ok(()) - }) - .collect::, ArrowError>>()? - }; + Some(value) => match regex.captures(value) { + Some(caps) => { + let mut iter = caps.iter(); + if caps.len() > 1 { + iter.next(); + } + for m in iter.flatten() { + list_builder.values().append_value(m.as_str()); + } + list_builder.append(true); + } + None => list_builder.append(false), + }, + None => list_builder.append(false), + } + Ok(()) + }) + .collect::, ArrowError>>()?; + + Ok(()) } fn regexp_scalar_match( @@ -408,7 +433,7 @@ fn regexp_scalar_match( let builder: GenericStringBuilder = GenericStringBuilder::with_capacity(0, 0); let mut list_builder = ListBuilder::new(builder); - process_regexp_match!(array, regex, list_builder); + process_regexp_match(array.iter(), regex, &mut list_builder)?; Ok(Arc::new(list_builder.finish())) } @@ -420,7 +445,7 @@ fn regexp_scalar_match_utf8view( let builder = StringViewBuilder::with_capacity(0); let mut list_builder = ListBuilder::new(builder); - process_regexp_match!(array, regex, list_builder); + process_regexp_match(array.iter(), regex, &mut list_builder)?; Ok(Arc::new(list_builder.finish())) } diff --git a/arrow/benches/string_run_builder.rs b/arrow/benches/string_run_builder.rs index b4457b74dada..6c96d7c36bfc 100644 --- a/arrow/benches/string_run_builder.rs +++ b/arrow/benches/string_run_builder.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use arrow::array::StringRunBuilder; +use arrow::array::{StringRunBuilder, ValuesBuilder}; use arrow::datatypes::Int32Type; use arrow::util::bench_util::create_string_array_for_runs; use criterion::{criterion_group, criterion_main, Criterion}; diff --git a/arrow/tests/array_equal.rs b/arrow/tests/array_equal.rs index 94fb85030bf3..24feedc13172 100644 --- a/arrow/tests/array_equal.rs +++ b/arrow/tests/array_equal.rs @@ -19,7 +19,7 @@ use arrow::array::{ make_array, Array, ArrayRef, BooleanArray, Decimal128Array, FixedSizeBinaryArray, FixedSizeBinaryBuilder, FixedSizeListBuilder, GenericBinaryArray, GenericStringArray, Int32Array, Int32Builder, Int64Builder, ListArray, ListBuilder, NullArray, OffsetSizeTrait, - StringArray, StringDictionaryBuilder, StructArray, UnionBuilder, + StringArray, StringDictionaryBuilder, StructArray, UnionBuilder, ValuesBuilder, }; use arrow::datatypes::{Int16Type, Int32Type}; use arrow_array::builder::{StringBuilder, StringViewBuilder, StructBuilder}; diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs index c6de9f4a3417..974fb2e1b8e3 100644 --- a/arrow/tests/array_transform.rs +++ b/arrow/tests/array_transform.rs @@ -19,7 +19,7 @@ use arrow::array::{ Array, ArrayRef, BooleanArray, Decimal128Array, DictionaryArray, FixedSizeBinaryArray, FixedSizeListBuilder, Int16Array, Int32Array, Int64Array, Int64Builder, ListArray, ListBuilder, MapBuilder, NullArray, StringArray, StringBuilder, StringDictionaryBuilder, StructArray, - UInt16Array, UInt16Builder, UInt8Array, UnionArray, + UInt16Array, UInt16Builder, UInt8Array, UnionArray, ValuesBuilder, }; use arrow::datatypes::Int16Type; use arrow_array::StringViewArray; diff --git a/arrow/tests/pyarrow.rs b/arrow/tests/pyarrow.rs index d9ebd0daa1cd..f9c69f949568 100644 --- a/arrow/tests/pyarrow.rs +++ b/arrow/tests/pyarrow.rs @@ -18,7 +18,7 @@ use arrow::array::{ArrayRef, Int32Array, StringArray}; use arrow::pyarrow::{FromPyArrow, ToPyArrow}; use arrow::record_batch::RecordBatch; -use arrow_array::builder::{BinaryViewBuilder, StringViewBuilder}; +use arrow_array::builder::{BinaryViewBuilder, StringViewBuilder, ValuesBuilder}; use arrow_array::{Array, BinaryViewArray, StringViewArray}; use pyo3::Python; use std::sync::Arc; diff --git a/parquet/src/arrow/array_reader/map_array.rs b/parquet/src/arrow/array_reader/map_array.rs index 4bdec602ba4f..cfb693a114e9 100644 --- a/parquet/src/arrow/array_reader/map_array.rs +++ b/parquet/src/arrow/array_reader/map_array.rs @@ -126,7 +126,7 @@ mod tests { use crate::arrow::arrow_reader::ParquetRecordBatchReader; use crate::arrow::ArrowWriter; use arrow::datatypes::{Field, Int32Type, Schema}; - use arrow_array::builder::{MapBuilder, PrimitiveBuilder, StringBuilder}; + use arrow_array::builder::{MapBuilder, PrimitiveBuilder, StringBuilder, ValuesBuilder}; use arrow_array::cast::*; use arrow_array::RecordBatch; use arrow_schema::Fields; diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index 09f8ec7cc274..149b64904400 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -30,7 +30,7 @@ use crate::file::statistics::Statistics as ParquetStatistics; use crate::schema::types::SchemaDescriptor; use arrow_array::builder::{ BinaryViewBuilder, BooleanBuilder, FixedSizeBinaryBuilder, LargeStringBuilder, StringBuilder, - StringViewBuilder, + StringViewBuilder, ValuesBuilder, }; use arrow_array::{ new_empty_array, new_null_array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array,