From 18efc723e62a9ddc452d2b9bf255b7af82ad5085 Mon Sep 17 00:00:00 2001 From: alamb Date: Tue, 29 Sep 2020 13:21:43 -0400 Subject: [PATCH 1/3] ARROW-10136: [Rust][DataFusion]: Fix null handling in string filter kernels --- rust/arrow/src/array/array.rs | 44 +++++++++++++++++++++++- rust/arrow/src/compute/kernels/filter.rs | 41 +++++++++++++++++++--- 2 files changed, 79 insertions(+), 6 deletions(-) diff --git a/rust/arrow/src/array/array.rs b/rust/arrow/src/array/array.rs index bfbfe39f1c9..2111505f369 100644 --- a/rust/arrow/src/array/array.rs +++ b/rust/arrow/src/array/array.rs @@ -1267,6 +1267,36 @@ impl GenericBinaryArray { GenericBinaryArray::::from(array_data) } + fn from_opt_vec(v: Vec>, data_type: DataType) -> Self { + let mut offsets = Vec::with_capacity(v.len() + 1); + let mut values = Vec::new(); + let mut null_buf = make_null_buffer(v.len()); + let mut length_so_far: OffsetSize = OffsetSize::zero(); + offsets.push(length_so_far); + + { + let null_slice = null_buf.data_mut(); + + for (i, s) in v.iter().enumerate() { + if let Some(s) = s { + bit_util::set_bit(null_slice, i); + length_so_far = + length_so_far + OffsetSize::from_usize(s.len()).unwrap(); + offsets.push(length_so_far); + values.extend_from_slice(s); + } + } + } + + let array_data = ArrayData::builder(data_type) + .len(v.len()) + .add_buffer(Buffer::from(offsets.to_byte_slice())) + .add_buffer(Buffer::from(&values[..])) + .null_bit_buffer(null_buf.freeze()) + .build(); + GenericBinaryArray::::from(array_data) + } + fn from_list(v: GenericListArray, datatype: DataType) -> Self { assert_eq!( v.data_ref().child_data()[0].child_data().len(), @@ -1368,12 +1398,24 @@ impl From> for BinaryArray { } } +impl From>> for BinaryArray { + fn from(v: Vec>) -> Self { + BinaryArray::from_opt_vec(v, DataType::Binary) + } +} + impl From> for LargeBinaryArray { fn from(v: Vec<&[u8]>) -> Self { LargeBinaryArray::from_vec(v, DataType::LargeBinary) } } +impl From>> for LargeBinaryArray { + fn from(v: Vec>) -> Self { + LargeBinaryArray::from_opt_vec(v, DataType::LargeBinary) + } +} + impl From for BinaryArray { fn from(v: ListArray) -> Self { BinaryArray::from_list(v, DataType::Binary) @@ -1846,7 +1888,7 @@ impl TryFrom> for StructArray { if let Some(len) = len { if len != child_datum.len() { return Err(ArrowError::InvalidArgumentError( - format!("Array of field \"{}\" has length {}, but previous elements have length {}. + format!("Array of field \"{}\" has length {}, but previous elements have length {}. All arrays in every entry in a struct array must have the same length.", field_name, child_datum.len(), len) )); } diff --git a/rust/arrow/src/compute/kernels/filter.rs b/rust/arrow/src/compute/kernels/filter.rs index ff0ac27def1..3a716bedcb4 100644 --- a/rust/arrow/src/compute/kernels/filter.rs +++ b/rust/arrow/src/compute/kernels/filter.rs @@ -341,7 +341,7 @@ impl FilterContext { } DataType::Binary => { let input_array = array.as_any().downcast_ref::().unwrap(); - let mut values: Vec<&[u8]> = Vec::with_capacity(self.filtered_count); + let mut values: Vec> = Vec::with_capacity(self.filtered_count); for i in 0..self.filter_u64.len() { // foreach u64 batch let filter_batch = self.filter_u64[i]; @@ -353,7 +353,11 @@ impl FilterContext { // foreach bit in batch: if (filter_batch & self.filter_mask[j]) != 0 { let data_index = (i * 64) + j; - values.push(input_array.value(data_index)); + if input_array.is_null(data_index) { + values.push(None) + } else { + values.push(Some(input_array.value(data_index))) + } } } } @@ -361,7 +365,7 @@ impl FilterContext { } DataType::Utf8 => { let input_array = array.as_any().downcast_ref::().unwrap(); - let mut values: Vec<&str> = Vec::with_capacity(self.filtered_count); + let mut values: Vec> = Vec::with_capacity(self.filtered_count); for i in 0..self.filter_u64.len() { // foreach u64 batch let filter_batch = self.filter_u64[i]; @@ -373,7 +377,11 @@ impl FilterContext { // foreach bit in batch: if (filter_batch & self.filter_mask[j]) != 0 { let data_index = (i * 64) + j; - values.push(input_array.value(data_index)); + if input_array.is_null(data_index) { + values.push(None) + } else { + values.push(Some(input_array.value(data_index))) + } } } } @@ -666,7 +674,7 @@ mod tests { } #[test] - fn test_filter_array_with_null() { + fn test_filter_primative_array_with_null() { let a = Int32Array::from(vec![Some(5), None]); let b = BooleanArray::from(vec![false, true]); let c = filter(&a, &b).unwrap(); @@ -675,6 +683,29 @@ mod tests { assert_eq!(true, d.is_null(0)); } + #[test] + fn test_filter_string_array_with_null() { + let a = StringArray::from(vec![Some("hello"), None, Some("world"), None]); + let b = BooleanArray::from(vec![true, false, false, true]); + let c = filter(&a, &b).unwrap(); + let d = c.as_ref().as_any().downcast_ref::().unwrap(); + assert_eq!(2, d.len()); + assert_eq!("hello", d.value(0)); + assert_eq!(true, d.is_null(1)); + } + + #[test] + fn test_filter_binary_array_with_null() { + let data: Vec> = vec![Some(b"hello"), None, Some(b"world"), None]; + let a = BinaryArray::from(data); + let b = BooleanArray::from(vec![true, false, false, true]); + let c = filter(&a, &b).unwrap(); + let d = c.as_ref().as_any().downcast_ref::().unwrap(); + assert_eq!(2, d.len()); + assert_eq!(b"hello", d.value(0)); + assert_eq!(true, d.is_null(1)); + } + #[test] fn test_filter_array_slice_with_null() { let a_slice = From 036eca9cd4a085b767255a1b55449ef8dac31762 Mon Sep 17 00:00:00 2001 From: alamb Date: Wed, 30 Sep 2020 06:46:18 -0400 Subject: [PATCH 2/3] Add additional coverage for is_null filter --- rust/arrow/src/compute/kernels/filter.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rust/arrow/src/compute/kernels/filter.rs b/rust/arrow/src/compute/kernels/filter.rs index 3a716bedcb4..c6cd711ba1b 100644 --- a/rust/arrow/src/compute/kernels/filter.rs +++ b/rust/arrow/src/compute/kernels/filter.rs @@ -691,6 +691,7 @@ mod tests { let d = c.as_ref().as_any().downcast_ref::().unwrap(); assert_eq!(2, d.len()); assert_eq!("hello", d.value(0)); + assert_eq!(false, d.is_null(0)); assert_eq!(true, d.is_null(1)); } @@ -703,6 +704,7 @@ mod tests { let d = c.as_ref().as_any().downcast_ref::().unwrap(); assert_eq!(2, d.len()); assert_eq!(b"hello", d.value(0)); + assert_eq!(false, d.is_null(0)); assert_eq!(true, d.is_null(1)); } From b83d867cbcf7cbe2d3bd97c0262a2dd7da0c3f00 Mon Sep 17 00:00:00 2001 From: alamb Date: Wed, 30 Sep 2020 06:59:17 -0400 Subject: [PATCH 3/3] Add test for {Large,}BinaryArray::from_opt_vec --- rust/arrow/src/array/array.rs | 37 ++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/rust/arrow/src/array/array.rs b/rust/arrow/src/array/array.rs index 2111505f369..895629627fd 100644 --- a/rust/arrow/src/array/array.rs +++ b/rust/arrow/src/array/array.rs @@ -1282,9 +1282,10 @@ impl GenericBinaryArray { bit_util::set_bit(null_slice, i); length_so_far = length_so_far + OffsetSize::from_usize(s.len()).unwrap(); - offsets.push(length_so_far); values.extend_from_slice(s); } + // always add an element in offsets + offsets.push(length_so_far); } } @@ -3553,6 +3554,40 @@ mod tests { } } + #[test] + fn test_binary_array_from_opt_vec() { + let values: Vec> = + vec![Some(b"one"), Some(b"two"), None, Some(b""), Some(b"three")]; + let array = BinaryArray::from_opt_vec(values, DataType::Binary); + assert_eq!(array.len(), 5); + assert_eq!(array.value(0), b"one"); + assert_eq!(array.value(1), b"two"); + assert_eq!(array.value(3), b""); + assert_eq!(array.value(4), b"three"); + assert_eq!(array.is_null(0), false); + assert_eq!(array.is_null(1), false); + assert_eq!(array.is_null(2), true); + assert_eq!(array.is_null(3), false); + assert_eq!(array.is_null(4), false); + } + + #[test] + fn test_large_binary_array_from_opt_vec() { + let values: Vec> = + vec![Some(b"one"), Some(b"two"), None, Some(b""), Some(b"three")]; + let array = LargeBinaryArray::from_opt_vec(values, DataType::LargeBinary); + assert_eq!(array.len(), 5); + assert_eq!(array.value(0), b"one"); + assert_eq!(array.value(1), b"two"); + assert_eq!(array.value(3), b""); + assert_eq!(array.value(4), b"three"); + assert_eq!(array.is_null(0), false); + assert_eq!(array.is_null(1), false); + assert_eq!(array.is_null(2), true); + assert_eq!(array.is_null(3), false); + assert_eq!(array.is_null(4), false); + } + #[test] fn test_string_array_from_u8_slice() { let values: Vec<&str> = vec!["hello", "", "parquet"];