-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Improve concat performance, and add append_array for some array builder implementations
#7309
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 18 commits
7245e26
5f4f91a
3c42e72
5e5be05
86e9ba1
060345d
55a1001
381ee83
2cab672
55df249
f9dd148
6b24c61
c64520e
0757bbb
1978d98
85e1416
aa1fdd4
70642d6
6ff3947
597112d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,7 +17,7 @@ | |
|
|
||
| use crate::builder::{ArrayBuilder, BufferBuilder, UInt8BufferBuilder}; | ||
| use crate::types::{ByteArrayType, GenericBinaryType, GenericStringType}; | ||
| use crate::{ArrayRef, GenericByteArray, OffsetSizeTrait}; | ||
| use crate::{Array, ArrayRef, GenericByteArray, OffsetSizeTrait}; | ||
| use arrow_buffer::NullBufferBuilder; | ||
| use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; | ||
| use arrow_data::ArrayDataBuilder; | ||
|
|
@@ -129,6 +129,48 @@ impl<T: ByteArrayType> GenericByteBuilder<T> { | |
| self.offsets_builder.append(self.next_offset()); | ||
| } | ||
|
|
||
| /// Appends array values and null to this builder as is | ||
| /// (this means that underlying null values are copied as is). | ||
| #[inline] | ||
| pub fn append_array(&mut self, array: &GenericByteArray<T>) { | ||
| if array.len() == 0 { | ||
| return; | ||
| } | ||
|
|
||
| let offsets = array.offsets(); | ||
|
|
||
| // If the offsets are contiguous, we can append them directly avoiding the need to align | ||
| // them | ||
| if self.next_offset() == offsets[0] { | ||
| self.offsets_builder.append_slice(&offsets[1..]); | ||
| } else { | ||
| // Shifting all the offsets | ||
| let shift: T::Offset = self.next_offset() - offsets[0]; | ||
|
|
||
| // Creating intermediate offsets instead of pushing each offset is faster | ||
| // (even if we make MutableBuffer to avoid updating length on each push | ||
| // and reserve the necessary capacity, it's still slower) | ||
| let mut intermediate = Vec::with_capacity(offsets.len() - 1); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🤔 maybe we need something like
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think many of these instances could be changed to use
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Filed follow on |
||
|
|
||
| for &offset in &offsets[1..] { | ||
| intermediate.push(offset + shift) | ||
| } | ||
|
|
||
| self.offsets_builder.append_slice(&intermediate); | ||
| } | ||
|
|
||
| // Append underlying values, starting from the first offset and ending at the last offset | ||
| self.value_builder.append_slice( | ||
| &array.values().as_slice()[offsets[0].as_usize()..offsets[array.len()].as_usize()], | ||
| ); | ||
|
|
||
| if let Some(null_buffer) = array.nulls() { | ||
| self.null_buffer_builder.append_buffer(null_buffer); | ||
| } else { | ||
| self.null_buffer_builder.append_n_non_nulls(array.len()); | ||
| } | ||
| } | ||
|
|
||
| /// Builds the [`GenericByteArray`] and reset this builder. | ||
| pub fn finish(&mut self) -> GenericByteArray<T> { | ||
| let array_type = T::DATA_TYPE; | ||
|
|
@@ -593,4 +635,101 @@ mod tests { | |
| &["foo".as_bytes(), "bar\n".as_bytes(), "fizbuz".as_bytes()] | ||
| ) | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_append_array_without_nulls() { | ||
| let input = vec![ | ||
| "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well", | ||
| "thank", "you", "for", "asking", | ||
| ]; | ||
| let arr1 = GenericStringArray::<i32>::from(input[..3].to_vec()); | ||
| let arr2 = GenericStringArray::<i32>::from(input[3..7].to_vec()); | ||
| let arr3 = GenericStringArray::<i32>::from(input[7..].to_vec()); | ||
|
|
||
| let mut builder = GenericStringBuilder::<i32>::new(); | ||
| builder.append_array(&arr1); | ||
| builder.append_array(&arr2); | ||
| builder.append_array(&arr3); | ||
|
|
||
| let actual = builder.finish(); | ||
| let expected = GenericStringArray::<i32>::from(input); | ||
|
|
||
| assert_eq!(actual, expected); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_append_array_with_nulls() { | ||
| let input = vec![ | ||
| Some("hello"), | ||
| None, | ||
| Some("how"), | ||
| None, | ||
| None, | ||
| None, | ||
| None, | ||
| Some("I"), | ||
| Some("am"), | ||
| Some("doing"), | ||
| Some("well"), | ||
| ]; | ||
| let arr1 = GenericStringArray::<i32>::from(input[..3].to_vec()); | ||
| let arr2 = GenericStringArray::<i32>::from(input[3..7].to_vec()); | ||
| let arr3 = GenericStringArray::<i32>::from(input[7..].to_vec()); | ||
|
|
||
| let mut builder = GenericStringBuilder::<i32>::new(); | ||
| builder.append_array(&arr1); | ||
| builder.append_array(&arr2); | ||
| builder.append_array(&arr3); | ||
|
|
||
| let actual = builder.finish(); | ||
| let expected = GenericStringArray::<i32>::from(input); | ||
|
|
||
| assert_eq!(actual, expected); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_append_empty_array() { | ||
rluvaton marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| let arr = GenericStringArray::<i32>::from(Vec::<&str>::new()); | ||
| let mut builder = GenericStringBuilder::<i32>::new(); | ||
| builder.append_array(&arr); | ||
| let result = builder.finish(); | ||
| assert_eq!(result.len(), 0); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_append_array_with_offset_not_starting_at_0() { | ||
| let input = vec![ | ||
| Some("hello"), | ||
| None, | ||
| Some("how"), | ||
| None, | ||
| None, | ||
| None, | ||
| None, | ||
| Some("I"), | ||
| Some("am"), | ||
| Some("doing"), | ||
| Some("well"), | ||
| ]; | ||
| let full_array = GenericStringArray::<i32>::from(input); | ||
| let sliced = full_array.slice(1, 4); | ||
|
|
||
| assert_ne!(sliced.offsets()[0].as_usize(), 0); | ||
| assert_ne!(sliced.offsets().last(), full_array.offsets().last()); | ||
|
|
||
| let mut builder = GenericStringBuilder::<i32>::new(); | ||
| builder.append_array(&sliced); | ||
| let actual = builder.finish(); | ||
|
|
||
| let expected = GenericStringArray::<i32>::from(vec![None, Some("how"), None, None]); | ||
|
|
||
| println!("actual: {:?}", actual); | ||
| println!("expected: {:?}", expected); | ||
| assert_eq!(actual, expected); | ||
rluvaton marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| #[test] | ||
| fn test_append_underlying_null_values_added_as_is() { | ||
| // TODO | ||
rluvaton marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.