diff --git a/rust/arrow/src/array/array_binary.rs b/rust/arrow/src/array/array_binary.rs index a8fca67197c..c4ca612e24c 100644 --- a/rust/arrow/src/array/array_binary.rs +++ b/rust/arrow/src/array/array_binary.rs @@ -25,8 +25,7 @@ use std::{ use super::{ array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, ArrayDataRef, - FixedSizeListArray, GenericBinaryIter, GenericListArray, LargeListArray, ListArray, - OffsetSizeTrait, + FixedSizeListArray, GenericBinaryIter, GenericListArray, OffsetSizeTrait, }; use crate::util::bit_util; use crate::{buffer::Buffer, datatypes::ToByteSlice}; @@ -105,7 +104,7 @@ impl GenericBinaryArray { let mut length_so_far: OffsetSize = OffsetSize::zero(); offsets.push(length_so_far); for s in &v { - length_so_far = length_so_far + OffsetSize::from_usize(s.len()).unwrap(); + length_so_far += OffsetSize::from_usize(s.len()).unwrap(); offsets.push(length_so_far); values.extend_from_slice(s); } @@ -236,8 +235,7 @@ where if let Some(s) = s { let s = s.as_ref(); bit_util::set_bit(null_slice, i); - length_so_far = - length_so_far + OffsetSize::from_usize(s.len()).unwrap(); + length_so_far += OffsetSize::from_usize(s.len()).unwrap(); values.extend_from_slice(s); } // always add an element in offsets @@ -294,15 +292,9 @@ impl From>> for LargeBinaryArray { } } -impl From for BinaryArray { - fn from(v: ListArray) -> Self { - BinaryArray::from_list(v) - } -} - -impl From for LargeBinaryArray { - fn from(v: LargeListArray) -> Self { - LargeBinaryArray::from_list(v) +impl From> for GenericBinaryArray { + fn from(v: GenericListArray) -> Self { + GenericBinaryArray::::from_list(v) } } @@ -633,7 +625,10 @@ impl Array for DecimalArray { #[cfg(test)] mod tests { - use crate::datatypes::Field; + use crate::{ + array::{LargeListArray, ListArray}, + datatypes::Field, + }; use super::*; diff --git a/rust/arrow/src/array/array_list.rs b/rust/arrow/src/array/array_list.rs index 5af27769bfa..75b8a4827e5 100644 --- a/rust/arrow/src/array/array_list.rs +++ b/rust/arrow/src/array/array_list.rs @@ -30,7 +30,7 @@ use crate::datatypes::ArrowNativeType; use crate::datatypes::DataType; /// trait declaring an offset size, relevant for i32 vs i64 array types. -pub trait OffsetSizeTrait: ArrowNativeType + Num + Ord { +pub trait OffsetSizeTrait: ArrowNativeType + Num + Ord + std::ops::AddAssign { fn prefix() -> &'static str; fn to_isize(&self) -> isize; diff --git a/rust/arrow/src/array/array_string.rs b/rust/arrow/src/array/array_string.rs index 9dde1c6bc61..43e5a57ad93 100644 --- a/rust/arrow/src/array/array_string.rs +++ b/rust/arrow/src/array/array_string.rs @@ -22,7 +22,7 @@ use std::{any::Any, iter::FromIterator}; use super::{ array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, ArrayDataRef, - GenericListArray, GenericStringIter, LargeListArray, ListArray, OffsetSizeTrait, + GenericListArray, GenericStringIter, OffsetSizeTrait, }; use crate::util::bit_util; use crate::{buffer::Buffer, datatypes::ToByteSlice}; @@ -128,7 +128,7 @@ impl GenericStringArray { let mut length_so_far = OffsetSize::zero(); offsets.push(length_so_far); for s in &v { - length_so_far = length_so_far + OffsetSize::from_usize(s.len()).unwrap(); + length_so_far += OffsetSize::from_usize(s.len()).unwrap(); offsets.push(length_so_far); values.extend_from_slice(s.as_bytes()); } @@ -168,7 +168,7 @@ where let null_slice = null_buf.as_slice_mut(); bit_util::set_bit(null_slice, i); - length_so_far = length_so_far + OffsetSize::from_usize(s.len()).unwrap(); + length_so_far += OffsetSize::from_usize(s.len()).unwrap(); offsets.push(length_so_far); values.extend_from_slice(s.as_bytes()); } else { @@ -269,15 +269,9 @@ pub type StringArray = GenericStringArray; /// whose maximum length (in bytes) is represented by a i64. pub type LargeStringArray = GenericStringArray; -impl From for StringArray { - fn from(v: ListArray) -> Self { - StringArray::from_list(v) - } -} - -impl From for LargeStringArray { - fn from(v: LargeListArray) -> Self { - LargeStringArray::from_list(v) +impl From> for GenericStringArray { + fn from(v: GenericListArray) -> Self { + GenericStringArray::::from_list(v) } } diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs index 95178dacf4b..fd1e3422fb3 100644 --- a/rust/arrow/src/array/builder.rs +++ b/rust/arrow/src/array/builder.rs @@ -674,14 +674,14 @@ impl PrimitiveBuilder { /// Array builder for `ListArray` #[derive(Debug)] -pub struct ListBuilder { - offsets_builder: BufferBuilder, +pub struct GenericListBuilder { + offsets_builder: BufferBuilder, bitmap_builder: BooleanBufferBuilder, values_builder: T, - len: usize, + len: OffsetSize, } -impl ListBuilder { +impl GenericListBuilder { /// Creates a new `ListArrayBuilder` from a given values array builder pub fn new(values_builder: T) -> Self { let capacity = values_builder.len(); @@ -691,18 +691,20 @@ impl ListBuilder { /// Creates a new `ListArrayBuilder` from a given values array builder /// `capacity` is the number of items to pre-allocate space for in this builder pub fn with_capacity(values_builder: T, capacity: usize) -> Self { - let mut offsets_builder = Int32BufferBuilder::new(capacity + 1); - offsets_builder.append(0); + let mut offsets_builder = BufferBuilder::::new(capacity + 1); + let len = OffsetSize::zero(); + offsets_builder.append(len); Self { offsets_builder, bitmap_builder: BooleanBufferBuilder::new(capacity), values_builder, - len: 0, + len, } } } -impl ArrayBuilder for ListBuilder +impl ArrayBuilder + for GenericListBuilder where T: 'static, { @@ -723,12 +725,12 @@ where /// Returns the number of array slots in the builder fn len(&self) -> usize { - self.len + self.len.to_usize().unwrap() } /// Returns whether the number of array slots is zero fn is_empty(&self) -> bool { - self.len == 0 + self.len == OffsetSize::zero() } /// Builds the array and reset this builder. @@ -737,7 +739,7 @@ where } } -impl ListBuilder +impl GenericListBuilder where T: 'static, { @@ -752,16 +754,16 @@ where /// Finish the current variable-length list array slot pub fn append(&mut self, is_valid: bool) -> Result<()> { self.offsets_builder - .append(self.values_builder.len() as i32); + .append(OffsetSize::from_usize(self.values_builder.len()).unwrap()); self.bitmap_builder.append(is_valid); - self.len += 1; + self.len += OffsetSize::one(); Ok(()) } /// Builds the `ListArray` and reset this builder. - pub fn finish(&mut self) -> ListArray { + pub fn finish(&mut self) -> GenericListArray { let len = self.len(); - self.len = 0; + self.len = OffsetSize::zero(); let values_arr = self .values_builder .as_any_mut() @@ -772,137 +774,30 @@ where let offset_buffer = self.offsets_builder.finish(); let null_bit_buffer = self.bitmap_builder.finish(); - self.offsets_builder.append(0); - let data = ArrayData::builder(DataType::List(Box::new(Field::new( + self.offsets_builder.append(self.len); + let field = Box::new(Field::new( "item", values_data.data_type().clone(), true, // TODO: find a consistent way of getting this - )))) - .len(len) - .add_buffer(offset_buffer) - .add_child_data(values_data) - .null_bit_buffer(null_bit_buffer) - .build(); - - ListArray::from(data) - } -} - -/// Array builder for `ListArray` -#[derive(Debug)] -pub struct LargeListBuilder { - offsets_builder: BufferBuilder, - bitmap_builder: BooleanBufferBuilder, - values_builder: T, - len: usize, -} - -impl LargeListBuilder { - /// Creates a new `LargeListArrayBuilder` from a given values array builder - pub fn new(values_builder: T) -> Self { - let capacity = values_builder.len(); - Self::with_capacity(values_builder, capacity) - } - - /// Creates a new `LargeListArrayBuilder` from a given values array builder - /// `capacity` is the number of items to pre-allocate space for in this builder - pub fn with_capacity(values_builder: T, capacity: usize) -> Self { - let mut offsets_builder = Int64BufferBuilder::new(capacity + 1); - offsets_builder.append(0); - Self { - offsets_builder, - bitmap_builder: BooleanBufferBuilder::new(capacity), - values_builder, - len: 0, - } - } -} - -impl ArrayBuilder for LargeListBuilder -where - T: 'static, -{ - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.len - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.len == 0 - } + )); + let data_type = if OffsetSize::prefix() == "Large" { + DataType::LargeList(field) + } else { + DataType::List(field) + }; + let data = ArrayData::builder(data_type) + .len(len) + .add_buffer(offset_buffer) + .add_child_data(values_data) + .null_bit_buffer(null_bit_buffer) + .build(); - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) + GenericListArray::::from(data) } } -impl LargeListBuilder -where - T: 'static, -{ - /// Returns the child array builder as a mutable reference. - /// - /// This mutable reference can be used to append values into the child array builder, - /// but you must call `append` to delimit each distinct list value. - pub fn values(&mut self) -> &mut T { - &mut self.values_builder - } - - /// Finish the current variable-length list array slot - pub fn append(&mut self, is_valid: bool) -> Result<()> { - self.offsets_builder - .append(self.values_builder.len() as i64); - self.bitmap_builder.append(is_valid); - self.len += 1; - Ok(()) - } - - /// Builds the `LargeListArray` and reset this builder. - pub fn finish(&mut self) -> LargeListArray { - let len = self.len(); - self.len = 0; - let values_arr = self - .values_builder - .as_any_mut() - .downcast_mut::() - .unwrap() - .finish(); - let values_data = values_arr.data(); - - let offset_buffer = self.offsets_builder.finish(); - let null_bit_buffer = self.bitmap_builder.finish(); - self.offsets_builder.append(0); - let data = ArrayData::builder(DataType::LargeList(Box::new(Field::new( - "item", - values_data.data_type().clone(), - true, - )))) - .len(len) - .add_buffer(offset_buffer) - .add_child_data(values_data) - .null_bit_buffer(null_bit_buffer) - .build(); - - LargeListArray::from(data) - } -} +pub type ListBuilder = GenericListBuilder; +pub type LargeListBuilder = GenericListBuilder; /// Array builder for `ListArray` #[derive(Debug)] @@ -1032,24 +927,20 @@ where /// Array builder for `BinaryArray` #[derive(Debug)] -pub struct BinaryBuilder { - builder: ListBuilder, +pub struct GenericBinaryBuilder { + builder: GenericListBuilder, } -#[derive(Debug)] -pub struct LargeBinaryBuilder { - builder: LargeListBuilder, -} +pub type BinaryBuilder = GenericBinaryBuilder; +pub type LargeBinaryBuilder = GenericBinaryBuilder; #[derive(Debug)] -pub struct StringBuilder { - builder: ListBuilder, +pub struct GenericStringBuilder { + builder: GenericListBuilder, } -#[derive(Debug)] -pub struct LargeStringBuilder { - builder: LargeListBuilder, -} +pub type StringBuilder = GenericStringBuilder; +pub type LargeStringBuilder = GenericStringBuilder; #[derive(Debug)] pub struct FixedSizeBinaryBuilder { @@ -1063,71 +954,9 @@ pub struct DecimalBuilder { scale: usize, } -impl ArrayBuilder for BinaryBuilder { - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.builder.len() - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.builder.is_empty() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } -} - -impl ArrayBuilder for LargeBinaryBuilder { - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.builder.len() - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.builder.is_empty() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } -} - -impl ArrayBuilder for StringBuilder { +impl ArrayBuilder + for GenericBinaryBuilder +{ /// Returns the builder as a non-mutable `Any` reference. fn as_any(&self) -> &Any { self @@ -1159,7 +988,9 @@ impl ArrayBuilder for StringBuilder { } } -impl ArrayBuilder for LargeStringBuilder { +impl ArrayBuilder + for GenericStringBuilder +{ /// Returns the builder as a non-mutable `Any` reference. fn as_any(&self) -> &Any { self @@ -1187,7 +1018,8 @@ impl ArrayBuilder for LargeStringBuilder { /// Builds the array and reset this builder. fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) + let a = GenericStringBuilder::::finish(self); + Arc::new(a) } } @@ -1255,13 +1087,13 @@ impl ArrayBuilder for DecimalBuilder { } } -impl BinaryBuilder { - /// Creates a new `BinaryBuilder`, `capacity` is the number of bytes in the values +impl GenericBinaryBuilder { + /// Creates a new `GenericBinaryBuilder`, `capacity` is the number of bytes in the values /// array pub fn new(capacity: usize) -> Self { let values_builder = UInt8Builder::new(capacity); Self { - builder: ListBuilder::new(values_builder), + builder: GenericListBuilder::new(values_builder), } } @@ -1295,63 +1127,18 @@ impl BinaryBuilder { } /// Builds the `BinaryArray` and reset this builder. - pub fn finish(&mut self) -> BinaryArray { - BinaryArray::from(self.builder.finish()) + pub fn finish(&mut self) -> GenericBinaryArray { + GenericBinaryArray::::from(self.builder.finish()) } } -impl LargeBinaryBuilder { - /// Creates a new `LargeBinaryBuilder`, `capacity` is the number of bytes in the values - /// array - pub fn new(capacity: usize) -> Self { - let values_builder = UInt8Builder::new(capacity); - Self { - builder: LargeListBuilder::new(values_builder), - } - } - - /// Appends a single byte value into the builder's values array. - /// - /// Note, when appending individual byte values you must call `append` to delimit each - /// distinct list value. - pub fn append_byte(&mut self, value: u8) -> Result<()> { - self.builder.values().append_value(value)?; - Ok(()) - } - - /// Appends a byte slice into the builder. - /// - /// Automatically calls the `append` method to delimit the slice appended in as a - /// distinct array element. - pub fn append_value(&mut self, value: &[u8]) -> Result<()> { - self.builder.values().append_slice(value)?; - self.builder.append(true)?; - Ok(()) - } - - /// Finish the current variable-length list array slot. - pub fn append(&mut self, is_valid: bool) -> Result<()> { - self.builder.append(is_valid) - } - - /// Append a null value to the array. - pub fn append_null(&mut self) -> Result<()> { - self.append(false) - } - - /// Builds the `LargeBinaryArray` and reset this builder. - pub fn finish(&mut self) -> LargeBinaryArray { - LargeBinaryArray::from(self.builder.finish()) - } -} - -impl StringBuilder { +impl GenericStringBuilder { /// Creates a new `StringBuilder`, /// `capacity` is the number of bytes of string data to pre-allocate space for in this builder pub fn new(capacity: usize) -> Self { let values_builder = UInt8Builder::new(capacity); Self { - builder: ListBuilder::new(values_builder), + builder: GenericListBuilder::new(values_builder), } } @@ -1361,7 +1148,7 @@ impl StringBuilder { pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { let values_builder = UInt8Builder::new(data_capacity); Self { - builder: ListBuilder::with_capacity(values_builder, item_capacity), + builder: GenericListBuilder::with_capacity(values_builder, item_capacity), } } @@ -1386,54 +1173,8 @@ impl StringBuilder { } /// Builds the `StringArray` and reset this builder. - pub fn finish(&mut self) -> StringArray { - StringArray::from(self.builder.finish()) - } -} - -impl LargeStringBuilder { - /// Creates a new `StringBuilder`, - /// `capacity` is the number of bytes of string data to pre-allocate space for in this builder - pub fn new(capacity: usize) -> Self { - let values_builder = UInt8Builder::new(capacity); - Self { - builder: LargeListBuilder::new(values_builder), - } - } - - /// Creates a new `StringBuilder`, - /// `data_capacity` is the number of bytes of string data to pre-allocate space for in this builder - /// `item_capacity` is the number of items to pre-allocate space for in this builder - pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { - let values_builder = UInt8Builder::new(data_capacity); - Self { - builder: LargeListBuilder::with_capacity(values_builder, item_capacity), - } - } - - /// Appends a string into the builder. - /// - /// Automatically calls the `append` method to delimit the string appended in as a - /// distinct array element. - pub fn append_value(&mut self, value: &str) -> Result<()> { - self.builder.values().append_slice(value.as_bytes())?; - self.builder.append(true)?; - Ok(()) - } - - /// Finish the current variable-length list array slot. - pub fn append(&mut self, is_valid: bool) -> Result<()> { - self.builder.append(is_valid) - } - - /// Append a null value to the array. - pub fn append_null(&mut self) -> Result<()> { - self.append(false) - } - - /// Builds the `LargeStringArray` and reset this builder. - pub fn finish(&mut self) -> LargeStringArray { - LargeStringArray::from(self.builder.finish()) + pub fn finish(&mut self) -> GenericStringArray { + GenericStringArray::::from(self.builder.finish()) } } diff --git a/rust/arrow/src/array/transform/list.rs b/rust/arrow/src/array/transform/list.rs index 300afa94bf4..8053513178e 100644 --- a/rust/arrow/src/array/transform/list.rs +++ b/rust/arrow/src/array/transform/list.rs @@ -73,7 +73,7 @@ pub(super) fn build_extend(array: &ArrayData) -> Extend { (start..start + len).for_each(|i| { if array.is_valid(i) { // compute the new offset - last_offset = last_offset + offsets[i + 1] - offsets[i]; + last_offset += offsets[i + 1] - offsets[i]; // append value child.extend( diff --git a/rust/arrow/src/array/transform/utils.rs b/rust/arrow/src/array/transform/utils.rs index c95912996f2..617db275193 100644 --- a/rust/arrow/src/array/transform/utils.rs +++ b/rust/arrow/src/array/transform/utils.rs @@ -57,7 +57,7 @@ pub(super) fn extend_offsets( offsets.windows(2).for_each(|offsets| { // compute the new offset let length = offsets[1] - offsets[0]; - last_offset = last_offset + length; + last_offset += length; buffer.extend_from_slice(last_offset.to_byte_slice()); }); } diff --git a/rust/arrow/src/array/transform/variable_size.rs b/rust/arrow/src/array/transform/variable_size.rs index dcd0ed6147f..e9143117e35 100644 --- a/rust/arrow/src/array/transform/variable_size.rs +++ b/rust/arrow/src/array/transform/variable_size.rs @@ -79,7 +79,7 @@ pub(super) fn build_extend(array: &ArrayData) -> Extend { if array.is_valid(i) { // compute the new offset let length = offsets[i + 1] - offsets[i]; - last_offset = last_offset + length; + last_offset += length; let length = length.to_usize().unwrap(); // append value diff --git a/rust/arrow/src/compute/kernels/substring.rs b/rust/arrow/src/compute/kernels/substring.rs index 66298e67722..4c9d1995feb 100644 --- a/rust/arrow/src/compute/kernels/substring.rs +++ b/rust/arrow/src/compute/kernels/substring.rs @@ -63,7 +63,7 @@ fn generic_substring( // .max(0) is not needed as it is guaranteed .min(offsets[i + 1] - start); // so we do not go beyond this entry - length_so_far = length_so_far + length; + length_so_far += length; new_offsets.push(length_so_far); diff --git a/rust/arrow/src/compute/util.rs b/rust/arrow/src/compute/util.rs index e4dae6fd8ee..87c3d42e2ec 100644 --- a/rust/arrow/src/compute/util.rs +++ b/rust/arrow/src/compute/util.rs @@ -124,7 +124,7 @@ where })?; let start = offsets[ix]; let end = offsets[ix + 1]; - current_offset = current_offset + (end - start); + current_offset += end - start; new_offsets.push(current_offset); let mut curr = start; @@ -132,7 +132,7 @@ where // if start == end, this slot is empty while curr < end { values.push(Some(curr)); - curr = curr + OffsetType::Native::one(); + curr += OffsetType::Native::one(); } } else { new_offsets.push(current_offset); diff --git a/rust/arrow/src/json/reader.rs b/rust/arrow/src/json/reader.rs index 7ef418ca64a..0438439b6d9 100644 --- a/rust/arrow/src/json/reader.rs +++ b/rust/arrow/src/json/reader.rs @@ -871,12 +871,12 @@ impl Decoder { offsets.push(cur_offset); rows.iter().enumerate().for_each(|(i, v)| { if let Value::Array(a) = v { - cur_offset = cur_offset + OffsetSize::from_usize(a.len()).unwrap(); + cur_offset += OffsetSize::from_usize(a.len()).unwrap(); bit_util::set_bit(list_nulls.as_slice_mut(), i); } else if let Value::Null = v { // value is null, not incremented } else { - cur_offset = cur_offset + OffsetSize::one(); + cur_offset += OffsetSize::one(); } offsets.push(cur_offset); }); diff --git a/rust/parquet/src/arrow/array_reader.rs b/rust/parquet/src/arrow/array_reader.rs index c0e05d82bbf..632eee5af99 100644 --- a/rust/parquet/src/arrow/array_reader.rs +++ b/rust/parquet/src/arrow/array_reader.rs @@ -913,7 +913,7 @@ impl ArrayReader for ListArrayReader { offsets.push(cur_offset) } if def_levels[i] > 0 { - cur_offset = cur_offset + OffsetSize::one(); + cur_offset += OffsetSize::one(); } } offsets.push(cur_offset);