Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 78 additions & 1 deletion rust/arrow/src/array/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1267,6 +1267,37 @@ impl<OffsetSize: OffsetSizeTrait> GenericBinaryArray<OffsetSize> {
GenericBinaryArray::<OffsetSize>::from(array_data)
}

fn from_opt_vec(v: Vec<Option<&[u8]>>, data_type: DataType) -> Self {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this code tested somewhere?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is tested (indirectly) in https://github.com/apache/arrow/pull/8303/files#diff-d7b0b7cde1850e8744ceda458c6dea81R700 -- but I think a more specific test would be valuable. I will add one.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This turned out to be a great call @jorgecarleitao -- I found a bug in this implementation while writing a test. Thank you for the suggestion. 💯

let mut offsets = Vec::with_capacity(v.len() + 1);
let mut values = Vec::new();
let mut null_buf = make_null_buffer(v.len());
let mut length_so_far: OffsetSize = OffsetSize::zero();
offsets.push(length_so_far);

{
let null_slice = null_buf.data_mut();

for (i, s) in v.iter().enumerate() {
if let Some(s) = s {
bit_util::set_bit(null_slice, i);
length_so_far =
length_so_far + OffsetSize::from_usize(s.len()).unwrap();
values.extend_from_slice(s);
}
// always add an element in offsets
offsets.push(length_so_far);
}
}

let array_data = ArrayData::builder(data_type)
.len(v.len())
.add_buffer(Buffer::from(offsets.to_byte_slice()))
.add_buffer(Buffer::from(&values[..]))
.null_bit_buffer(null_buf.freeze())
.build();
GenericBinaryArray::<OffsetSize>::from(array_data)
}

fn from_list(v: GenericListArray<OffsetSize>, datatype: DataType) -> Self {
assert_eq!(
v.data_ref().child_data()[0].child_data().len(),
Expand Down Expand Up @@ -1368,12 +1399,24 @@ impl From<Vec<&[u8]>> for BinaryArray {
}
}

impl From<Vec<Option<&[u8]>>> for BinaryArray {
fn from(v: Vec<Option<&[u8]>>) -> Self {
BinaryArray::from_opt_vec(v, DataType::Binary)
}
}

impl From<Vec<&[u8]>> for LargeBinaryArray {
fn from(v: Vec<&[u8]>) -> Self {
LargeBinaryArray::from_vec(v, DataType::LargeBinary)
}
}

impl From<Vec<Option<&[u8]>>> for LargeBinaryArray {
fn from(v: Vec<Option<&[u8]>>) -> Self {
LargeBinaryArray::from_opt_vec(v, DataType::LargeBinary)
}
}

impl From<ListArray> for BinaryArray {
fn from(v: ListArray) -> Self {
BinaryArray::from_list(v, DataType::Binary)
Expand Down Expand Up @@ -1846,7 +1889,7 @@ impl TryFrom<Vec<(&str, ArrayRef)>> for StructArray {
if let Some(len) = len {
if len != child_datum.len() {
return Err(ArrowError::InvalidArgumentError(
format!("Array of field \"{}\" has length {}, but previous elements have length {}.
format!("Array of field \"{}\" has length {}, but previous elements have length {}.
All arrays in every entry in a struct array must have the same length.", field_name, child_datum.len(), len)
));
}
Expand Down Expand Up @@ -3511,6 +3554,40 @@ mod tests {
}
}

#[test]
fn test_binary_array_from_opt_vec() {
let values: Vec<Option<&[u8]>> =
vec![Some(b"one"), Some(b"two"), None, Some(b""), Some(b"three")];
let array = BinaryArray::from_opt_vec(values, DataType::Binary);
assert_eq!(array.len(), 5);
assert_eq!(array.value(0), b"one");
assert_eq!(array.value(1), b"two");
assert_eq!(array.value(3), b"");
assert_eq!(array.value(4), b"three");
assert_eq!(array.is_null(0), false);
assert_eq!(array.is_null(1), false);
assert_eq!(array.is_null(2), true);
assert_eq!(array.is_null(3), false);
assert_eq!(array.is_null(4), false);
}

#[test]
fn test_large_binary_array_from_opt_vec() {
let values: Vec<Option<&[u8]>> =
vec![Some(b"one"), Some(b"two"), None, Some(b""), Some(b"three")];
let array = LargeBinaryArray::from_opt_vec(values, DataType::LargeBinary);
assert_eq!(array.len(), 5);
assert_eq!(array.value(0), b"one");
assert_eq!(array.value(1), b"two");
assert_eq!(array.value(3), b"");
assert_eq!(array.value(4), b"three");
assert_eq!(array.is_null(0), false);
assert_eq!(array.is_null(1), false);
assert_eq!(array.is_null(2), true);
assert_eq!(array.is_null(3), false);
assert_eq!(array.is_null(4), false);
}

#[test]
fn test_string_array_from_u8_slice() {
let values: Vec<&str> = vec!["hello", "", "parquet"];
Expand Down
43 changes: 38 additions & 5 deletions rust/arrow/src/compute/kernels/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ impl FilterContext {
}
DataType::Binary => {
let input_array = array.as_any().downcast_ref::<BinaryArray>().unwrap();
let mut values: Vec<&[u8]> = Vec::with_capacity(self.filtered_count);
let mut values: Vec<Option<&[u8]>> = Vec::with_capacity(self.filtered_count);
for i in 0..self.filter_u64.len() {
// foreach u64 batch
let filter_batch = self.filter_u64[i];
Expand All @@ -353,15 +353,19 @@ impl FilterContext {
// foreach bit in batch:
if (filter_batch & self.filter_mask[j]) != 0 {
let data_index = (i * 64) + j;
values.push(input_array.value(data_index));
if input_array.is_null(data_index) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(this is the code for BinaryArray that @nevi-me referred to in #8303 (comment))

values.push(None)
} else {
values.push(Some(input_array.value(data_index)))
}
}
}
}
Ok(Arc::new(BinaryArray::from(values)))
}
DataType::Utf8 => {
let input_array = array.as_any().downcast_ref::<StringArray>().unwrap();
let mut values: Vec<&str> = Vec::with_capacity(self.filtered_count);
let mut values: Vec<Option<&str>> = Vec::with_capacity(self.filtered_count);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note using an Option is likely to increase the temporary storage requirements a bit.

It would likely be possible to avoid this allocation entirely if we used the lower level ArrayBuilder::with_bit_buffer.

I chose to follow the style of the rest of this module, though I would love opinions on trying to perf check this / optimize it (maybe a follow on JIRA ticket is enough)?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't an Option of a reference leverages null pointer optimization (on which None is represented by the null pointer, e.g. rust-lang/rust#9378)?

IMO we should follow up on this: for kernels we have been using a mutable buffer with null masks as much as possible.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't an Option of a reference leverages null pointer optimization (on which None is represented by the null pointer, e.g. rust-lang/rust#9378)?

Yes, I believe you are correct.

This program:

fn main() {
    println!("The size of a &str is {}", std::mem::size_of::<&str>());
    println!("The size of an Option<&str> is {}", std::mem::size_of::<Option<&str>>());
}

Produces the following on my machine:

The size of a &str is 16
The size of an Option<&str> is 16

for i in 0..self.filter_u64.len() {
// foreach u64 batch
let filter_batch = self.filter_u64[i];
Expand All @@ -373,7 +377,11 @@ impl FilterContext {
// foreach bit in batch:
if (filter_batch & self.filter_mask[j]) != 0 {
let data_index = (i * 64) + j;
values.push(input_array.value(data_index));
if input_array.is_null(data_index) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Likewise, this special case appears to miss the null check too

values.push(None)
} else {
values.push(Some(input_array.value(data_index)))
}
}
}
}
Expand Down Expand Up @@ -666,7 +674,7 @@ mod tests {
}

#[test]
fn test_filter_array_with_null() {
fn test_filter_primative_array_with_null() {
let a = Int32Array::from(vec![Some(5), None]);
let b = BooleanArray::from(vec![false, true]);
let c = filter(&a, &b).unwrap();
Expand All @@ -675,6 +683,31 @@ mod tests {
assert_eq!(true, d.is_null(0));
}

#[test]
fn test_filter_string_array_with_null() {
let a = StringArray::from(vec![Some("hello"), None, Some("world"), None]);
let b = BooleanArray::from(vec![true, false, false, true]);
let c = filter(&a, &b).unwrap();
let d = c.as_ref().as_any().downcast_ref::<StringArray>().unwrap();
assert_eq!(2, d.len());
assert_eq!("hello", d.value(0));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suggest that we test the 3 quantities: d.is_null(0), d.value(0), d.is_null(1). Alternatively,

let expected = StringArray::from(vec![Some("hello"), None]);
assert_eq!(d, expected);

assert_eq!(false, d.is_null(0));
assert_eq!(true, d.is_null(1));
}

#[test]
fn test_filter_binary_array_with_null() {
let data: Vec<Option<&[u8]>> = vec![Some(b"hello"), None, Some(b"world"), None];
let a = BinaryArray::from(data);
let b = BooleanArray::from(vec![true, false, false, true]);
let c = filter(&a, &b).unwrap();
let d = c.as_ref().as_any().downcast_ref::<BinaryArray>().unwrap();
assert_eq!(2, d.len());
assert_eq!(b"hello", d.value(0));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here.

assert_eq!(false, d.is_null(0));
assert_eq!(true, d.is_null(1));
}

#[test]
fn test_filter_array_slice_with_null() {
let a_slice =
Expand Down