diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 980dd2a48979..6eb93e31c9ff 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -377,6 +377,18 @@ impl From> for ArrayData { } } +impl<'a, Ptr, T> FromIterator<&'a Option> for GenericByteViewArray +where + Ptr: AsRef + 'a, + T: ByteViewType + ?Sized, +{ + fn from_iter>>(iter: I) -> Self { + iter.into_iter() + .map(|o| o.as_ref().map(|p| p.as_ref())) + .collect() + } +} + impl FromIterator> for GenericByteViewArray where Ptr: AsRef, @@ -433,6 +445,24 @@ impl From> for StringViewArray { } } +impl From>> for StringViewArray { + fn from(v: Vec>) -> Self { + v.into_iter().collect() + } +} + +impl From> for StringViewArray { + fn from(v: Vec) -> Self { + Self::from_iter_values(v) + } +} + +impl From>> for StringViewArray { + fn from(v: Vec>) -> Self { + v.into_iter().collect() + } +} + #[cfg(test)] mod tests { use crate::builder::StringViewBuilder; diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs index b3fb2d293a72..a3d805d099eb 100644 --- a/arrow/src/util/bench_util.rs +++ b/arrow/src/util/bench_util.rs @@ -119,6 +119,78 @@ pub fn create_string_array_with_len( .collect() } +/// Creates a random (but fixed-seeded) array of a given size, null density and length +pub fn create_string_view_array_with_len( + size: usize, + null_density: f32, + str_len: usize, + mixed: bool, +) -> StringViewArray { + let rng = &mut seedable_rng(); + + let mut lengths = Vec::with_capacity(size); + + // if mixed, we creates first half that string length small than 12 bytes and second half large than 12 bytes + if mixed { + for _ in 0..size / 2 { + lengths.push(rng.gen_range(1..12)); + } + for _ in size / 2..size { + lengths.push(rng.gen_range(12..=std::cmp::max(30, str_len))); + } + } else { + lengths.resize(size, str_len); + } + + lengths + .into_iter() + .map(|len| { + if rng.gen::() < null_density { + None + } else { + let value: Vec = rng.sample_iter(&Alphanumeric).take(len).collect(); + Some(String::from_utf8(value).unwrap()) + } + }) + .collect() +} + +/// Creates a random (but fixed-seeded) array of a given size, null density and length +pub fn create_binary_view_array_with_len( + size: usize, + null_density: f32, + str_len: usize, + mixed: bool, +) -> BinaryViewArray { + let rng = &mut seedable_rng(); + + let mut lengths = Vec::with_capacity(size); + + // if mixed, we creates first half that string length small than 12 bytes and second half large than 12 bytes + if mixed { + for _ in 0..size / 2 { + lengths.push(rng.gen_range(1..12)); + } + for _ in size / 2..size { + lengths.push(rng.gen_range(12..=std::cmp::max(30, str_len))); + } + } else { + lengths.resize(size, str_len); + } + + lengths + .into_iter() + .map(|len| { + if rng.gen::() < null_density { + None + } else { + let value: Vec = rng.sample_iter(&Alphanumeric).take(len).collect(); + Some(value) + } + }) + .collect() +} + /// Creates an random (but fixed-seeded) array of a given size and null density /// consisting of random 4 character alphanumeric strings pub fn create_string_dict_array( diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs index c63aa6bba3e5..6c4c41f2c556 100644 --- a/arrow/src/util/data_gen.rs +++ b/arrow/src/util/data_gen.rs @@ -121,9 +121,21 @@ pub fn create_random_array( }, Utf8 => Arc::new(create_string_array::(size, null_density)), LargeUtf8 => Arc::new(create_string_array::(size, null_density)), + Utf8View => Arc::new(create_string_view_array_with_len( + size, + null_density, + 4, + false, + )), Binary => Arc::new(create_binary_array::(size, null_density)), LargeBinary => Arc::new(create_binary_array::(size, null_density)), FixedSizeBinary(len) => Arc::new(create_fsb_array(size, null_density, *len as usize)), + BinaryView => Arc::new(create_binary_view_array_with_len( + size, + null_density, + 4, + false, + )), List(_) => create_random_list_array(field, size, null_density, true_density)?, LargeList(_) => create_random_list_array(field, size, null_density, true_density)?, Struct(fields) => Arc::new(StructArray::try_from( diff --git a/parquet/benches/arrow_reader.rs b/parquet/benches/arrow_reader.rs index 7eed86d2826e..e1853d755271 100644 --- a/parquet/benches/arrow_reader.rs +++ b/parquet/benches/arrow_reader.rs @@ -23,7 +23,8 @@ use criterion::{criterion_group, criterion_main, BenchmarkGroup, Criterion}; use num::FromPrimitive; use num_bigint::BigInt; use parquet::arrow::array_reader::{ - make_byte_array_reader, make_fixed_len_byte_array_reader, ListArrayReader, + make_byte_array_reader, make_byte_view_array_reader, make_fixed_len_byte_array_reader, + ListArrayReader, }; use parquet::basic::Type; use parquet::data_type::{ByteArray, FixedLenByteArrayType}; @@ -502,6 +503,13 @@ fn create_string_byte_array_reader( make_byte_array_reader(Box::new(page_iterator), column_desc, None).unwrap() } +fn create_string_view_byte_array_reader( + page_iterator: impl PageIterator + 'static, + column_desc: ColumnDescPtr, +) -> Box { + make_byte_view_array_reader(Box::new(page_iterator), column_desc, None).unwrap() +} + fn create_string_byte_array_dictionary_reader( page_iterator: impl PageIterator + 'static, column_desc: ColumnDescPtr, @@ -993,6 +1001,95 @@ fn add_benches(c: &mut Criterion) { group.finish(); + // string view benchmarks + //============================== + + let mut group = c.benchmark_group("arrow_array_reader/StringViewArray"); + + // string, plain encoded, no NULLs + let plain_string_no_null_data = + build_plain_encoded_string_page_iterator(mandatory_string_column_desc.clone(), 0.0); + group.bench_function("plain encoded, mandatory, no NULLs", |b| { + b.iter(|| { + let array_reader = create_string_view_byte_array_reader( + plain_string_no_null_data.clone(), + mandatory_string_column_desc.clone(), + ); + count = bench_array_reader(array_reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }); + + let plain_string_no_null_data = + build_plain_encoded_string_page_iterator(optional_string_column_desc.clone(), 0.0); + group.bench_function("plain encoded, optional, no NULLs", |b| { + b.iter(|| { + let array_reader = create_string_view_byte_array_reader( + plain_string_no_null_data.clone(), + optional_string_column_desc.clone(), + ); + count = bench_array_reader(array_reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }); + + // string, plain encoded, half NULLs + let plain_string_half_null_data = + build_plain_encoded_string_page_iterator(optional_string_column_desc.clone(), 0.5); + group.bench_function("plain encoded, optional, half NULLs", |b| { + b.iter(|| { + let array_reader = create_string_view_byte_array_reader( + plain_string_half_null_data.clone(), + optional_string_column_desc.clone(), + ); + count = bench_array_reader(array_reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }); + + // string, dictionary encoded, no NULLs + let dictionary_string_no_null_data = + build_dictionary_encoded_string_page_iterator(mandatory_string_column_desc.clone(), 0.0); + group.bench_function("dictionary encoded, mandatory, no NULLs", |b| { + b.iter(|| { + let array_reader = create_string_view_byte_array_reader( + dictionary_string_no_null_data.clone(), + mandatory_string_column_desc.clone(), + ); + count = bench_array_reader(array_reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }); + + let dictionary_string_no_null_data = + build_dictionary_encoded_string_page_iterator(optional_string_column_desc.clone(), 0.0); + group.bench_function("dictionary encoded, optional, no NULLs", |b| { + b.iter(|| { + let array_reader = create_string_view_byte_array_reader( + dictionary_string_no_null_data.clone(), + optional_string_column_desc.clone(), + ); + count = bench_array_reader(array_reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }); + + // string, dictionary encoded, half NULLs + let dictionary_string_half_null_data = + build_dictionary_encoded_string_page_iterator(optional_string_column_desc.clone(), 0.5); + group.bench_function("dictionary encoded, optional, half NULLs", |b| { + b.iter(|| { + let array_reader = create_string_view_byte_array_reader( + dictionary_string_half_null_data.clone(), + optional_string_column_desc.clone(), + ); + count = bench_array_reader(array_reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }); + + group.finish(); + // list benchmarks //============================== diff --git a/parquet/benches/arrow_writer.rs b/parquet/benches/arrow_writer.rs index b84e897db2f3..5e08fc7dadf8 100644 --- a/parquet/benches/arrow_writer.rs +++ b/parquet/benches/arrow_writer.rs @@ -96,6 +96,24 @@ fn create_string_bench_batch( )?) } +fn create_string_view_bench_batch( + size: usize, + null_density: f32, + true_density: f32, +) -> Result { + let fields = vec![ + Field::new("_1", DataType::Utf8View, true), + Field::new("_2", DataType::BinaryView, true), + ]; + let schema = Schema::new(fields); + Ok(create_random_batch( + Arc::new(schema), + size, + null_density, + true_density, + )?) +} + fn create_string_dictionary_bench_batch( size: usize, null_density: f32, @@ -395,6 +413,22 @@ fn bench_primitive_writer(c: &mut Criterion) { b.iter(|| write_batch_enable_bloom_filter(&batch).unwrap()) }); + let batch = create_string_view_bench_batch(4096, 0.25, 0.75).unwrap(); + group.throughput(Throughput::Bytes( + batch + .columns() + .iter() + .map(|f| f.get_array_memory_size() as u64) + .sum(), + )); + group.bench_function("4096 values string", |b| { + b.iter(|| write_batch(&batch).unwrap()) + }); + + group.bench_function("4096 values string with bloom filter", |b| { + b.iter(|| write_batch_enable_bloom_filter(&batch).unwrap()) + }); + let batch = create_string_dictionary_bench_batch(4096, 0.25, 0.75).unwrap(); group.throughput(Throughput::Bytes( batch diff --git a/parquet/src/arrow/array_reader/mod.rs b/parquet/src/arrow/array_reader/mod.rs index 4ae0f5669e87..f662156543ea 100644 --- a/parquet/src/arrow/array_reader/mod.rs +++ b/parquet/src/arrow/array_reader/mod.rs @@ -46,6 +46,7 @@ mod test_util; pub use builder::build_array_reader; pub use byte_array::make_byte_array_reader; +pub use byte_array::make_byte_view_array_reader; pub use byte_array_dictionary::make_byte_array_dictionary_reader; #[allow(unused_imports)] // Only used for benchmarks pub use fixed_len_byte_array::make_fixed_len_byte_array_reader; diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index d42db0d9cbc1..bf4b88ac52d4 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1231,7 +1231,8 @@ mod tests { fn arrow_writer_binary_view() { let string_field = Field::new("a", DataType::Utf8View, false); let binary_field = Field::new("b", DataType::BinaryView, false); - let schema = Schema::new(vec![string_field, binary_field]); + let nullable_string_field = Field::new("a", DataType::Utf8View, true); + let schema = Schema::new(vec![string_field, binary_field, nullable_string_field]); let raw_string_values = vec!["foo", "bar", "large payload over 12 bytes", "lulu"]; let raw_binary_values = vec![ @@ -1240,16 +1241,24 @@ mod tests { b"large payload over 12 bytes".to_vec(), b"lulu".to_vec(), ]; + let nullable_string_values = + vec![Some("foo"), None, Some("large payload over 12 bytes"), None]; let string_view_values = StringViewArray::from(raw_string_values); let binary_view_values = BinaryViewArray::from_iter_values(raw_binary_values); + let nullable_string_view_values = StringViewArray::from(nullable_string_values); let batch = RecordBatch::try_new( Arc::new(schema), - vec![Arc::new(string_view_values), Arc::new(binary_view_values)], + vec![ + Arc::new(string_view_values), + Arc::new(binary_view_values), + Arc::new(nullable_string_view_values), + ], ) .unwrap(); - roundtrip(batch, Some(SMALL_SIZE / 2)); + roundtrip(batch.clone(), Some(SMALL_SIZE / 2)); + roundtrip(batch, None); } fn get_decimal_batch(precision: u8, scale: i8) -> RecordBatch { diff --git a/parquet/src/arrow/buffer/offset_buffer.rs b/parquet/src/arrow/buffer/offset_buffer.rs index ab6347d021a1..4f838caa3b8c 100644 --- a/parquet/src/arrow/buffer/offset_buffer.rs +++ b/parquet/src/arrow/buffer/offset_buffer.rs @@ -154,17 +154,14 @@ impl OffsetBuffer { } } - fn build_generic_byte_view(&self) -> GenericByteViewBuilder { + fn build_generic_byte_view(self) -> GenericByteViewBuilder { let mut builder = GenericByteViewBuilder::::with_capacity(self.len()); - for i in self.offsets.windows(2) { - let start = i[0]; - let end = i[1]; - let b = unsafe { - std::slice::from_raw_parts( - self.values.as_ptr().offset(start.to_isize().unwrap()), - (end - start).to_usize().unwrap(), - ) - }; + let mut values = self.values; + for window in self.offsets.windows(2) { + let start = window[0]; + let end = window[1]; + let len = (end - start).to_usize().unwrap(); + let b = values.drain(..len).collect::>(); if b.is_empty() { builder.append_null(); } else {