diff --git a/rust/arrow/src/array/array.rs b/rust/arrow/src/array/array.rs index f35c02aa3d0..3e8c3eab5cc 100644 --- a/rust/arrow/src/array/array.rs +++ b/rust/arrow/src/array/array.rs @@ -1265,34 +1265,7 @@ impl GenericBinaryArray { /// Creates a [GenericBinaryArray] from a vector of Optional (null) byte slices pub fn from_opt_vec(v: Vec>) -> Self { - let mut offsets = Vec::with_capacity(v.len() + 1); - let mut values = Vec::new(); - let mut null_buf = make_null_buffer(v.len()); - let mut length_so_far: OffsetSize = OffsetSize::zero(); - offsets.push(length_so_far); - - { - let null_slice = null_buf.data_mut(); - - for (i, s) in v.iter().enumerate() { - if let Some(s) = s { - bit_util::set_bit(null_slice, i); - length_so_far = - length_so_far + OffsetSize::from_usize(s.len()).unwrap(); - values.extend_from_slice(s); - } - // always add an element in offsets - offsets.push(length_so_far); - } - } - - let array_data = ArrayData::builder(OffsetSize::DATA_TYPE) - .len(v.len()) - .add_buffer(Buffer::from(offsets.to_byte_slice())) - .add_buffer(Buffer::from(&values[..])) - .null_bit_buffer(null_buf.freeze()) - .build(); - GenericBinaryArray::::from(array_data) + v.into_iter().collect() } fn from_list(v: GenericListArray) -> Self { @@ -1323,6 +1296,13 @@ impl GenericBinaryArray { } } +impl<'a, T: BinaryOffsetSizeTrait> GenericBinaryArray { + /// constructs a new iterator + pub fn iter(&'a self) -> GenericBinaryIter<'a, T> { + GenericBinaryIter::<'a, T>::new(&self) + } +} + impl fmt::Debug for GenericBinaryArray { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{}BinaryArray\n[\n", OffsetSize::prefix())?; @@ -1391,12 +1371,63 @@ impl From } } +impl FromIterator> + for GenericBinaryArray +where + Ptr: AsRef<[u8]>, +{ + fn from_iter>>(iter: I) -> Self { + let iter = iter.into_iter(); + let (_, data_len) = iter.size_hint(); + let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound. + + let mut offsets = Vec::with_capacity(data_len + 1); + let mut values = Vec::new(); + let mut null_buf = make_null_buffer(data_len); + let mut length_so_far: OffsetSize = OffsetSize::zero(); + offsets.push(length_so_far); + + { + let null_slice = null_buf.data_mut(); + + for (i, s) in iter.enumerate() { + if let Some(s) = s { + let s = s.as_ref(); + bit_util::set_bit(null_slice, i); + length_so_far = + length_so_far + OffsetSize::from_usize(s.len()).unwrap(); + values.extend_from_slice(s); + } + // always add an element in offsets + offsets.push(length_so_far); + } + } + + let array_data = ArrayData::builder(OffsetSize::DATA_TYPE) + .len(data_len) + .add_buffer(Buffer::from(offsets.to_byte_slice())) + .add_buffer(Buffer::from(&values[..])) + .null_bit_buffer(null_buf.freeze()) + .build(); + Self::from(array_data) + } +} + /// An array where each element is a byte whose maximum length is represented by a i32. pub type BinaryArray = GenericBinaryArray; /// An array where each element is a byte whose maximum length is represented by a i64. pub type LargeBinaryArray = GenericBinaryArray; +impl<'a, T: BinaryOffsetSizeTrait> IntoIterator for &'a GenericBinaryArray { + type Item = Option<&'a [u8]>; + type IntoIter = GenericBinaryIter<'a, T>; + + fn into_iter(self) -> Self::IntoIter { + GenericBinaryIter::<'a, T>::new(self) + } +} + impl From> for BinaryArray { fn from(v: Vec<&[u8]>) -> Self { BinaryArray::from_vec(v) diff --git a/rust/arrow/src/array/iterator.rs b/rust/arrow/src/array/iterator.rs index fda77ffcbbb..9cab95c1ac4 100644 --- a/rust/arrow/src/array/iterator.rs +++ b/rust/arrow/src/array/iterator.rs @@ -17,7 +17,10 @@ use crate::datatypes::ArrowPrimitiveType; -use super::{Array, GenericStringArray, PrimitiveArray, StringOffsetSizeTrait}; +use super::{ + array::BinaryOffsetSizeTrait, Array, GenericBinaryArray, GenericStringArray, + PrimitiveArray, StringOffsetSizeTrait, +}; /// an iterator that returns Some(T) or None, that can be used on any non-boolean PrimitiveArray #[derive(Debug)] @@ -111,11 +114,60 @@ impl<'a, T: StringOffsetSizeTrait> std::iter::ExactSizeIterator { } +/// an iterator that returns `Some(&[u8])` or `None`, for binary arrays +#[derive(Debug)] +pub struct GenericBinaryIter<'a, T> +where + T: BinaryOffsetSizeTrait, +{ + array: &'a GenericBinaryArray, + i: usize, + len: usize, +} + +impl<'a, T: BinaryOffsetSizeTrait> GenericBinaryIter<'a, T> { + /// create a new iterator + pub fn new(array: &'a GenericBinaryArray) -> Self { + GenericBinaryIter:: { + array, + i: 0, + len: array.len(), + } + } +} + +impl<'a, T: BinaryOffsetSizeTrait> std::iter::Iterator for GenericBinaryIter<'a, T> { + type Item = Option<&'a [u8]>; + + fn next(&mut self) -> Option { + let i = self.i; + if i >= self.len { + None + } else if self.array.is_null(i) { + self.i += 1; + Some(None) + } else { + self.i += 1; + Some(Some(self.array.value(i))) + } + } + + fn size_hint(&self) -> (usize, Option) { + (self.len, Some(self.len)) + } +} + +/// all arrays have known size. +impl<'a, T: BinaryOffsetSizeTrait> std::iter::ExactSizeIterator + for GenericBinaryIter<'a, T> +{ +} + #[cfg(test)] mod tests { use std::sync::Arc; - use crate::array::{ArrayRef, Int32Array, StringArray}; + use crate::array::{ArrayRef, BinaryArray, Int32Array, StringArray}; #[test] fn test_primitive_array_iter_round_trip() { @@ -156,4 +208,20 @@ mod tests { StringArray::from(vec![Some("ab"), None, Some("aaab"), None, Some("aaaaab")]); assert_eq!(result, expected); } + + #[test] + fn test_binary_array_iter_round_trip() { + let array = BinaryArray::from(vec![ + Some(b"a" as &[u8]), + None, + Some(b"aaa"), + None, + Some(b"aaaaa"), + ]); + + // to and from iter + let result: BinaryArray = array.iter().collect(); + + assert_eq!(result, array); + } }