-
Notifications
You must be signed in to change notification settings - Fork 4k
ARROW-4605: [Rust] Move filter and limit code from DataFusion into compute module #3741
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 12 commits
d216fa0
0ca0412
32a2f85
b20ea6d
2a389a3
6422e18
2f44a8a
2e9616b
5a1047c
58d1f5c
728884b
f0578f6
257d235
344379a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,10 +17,16 @@ | |
|
|
||
| //! Defines primitive computations on arrays, e.g. addition, equality, boolean logic. | ||
|
|
||
| use std::cmp; | ||
| use std::ops::Add; | ||
| use std::sync::Arc; | ||
|
|
||
| use crate::array::{Array, BooleanArray, PrimitiveArray}; | ||
| use crate::datatypes::ArrowNumericType; | ||
| use crate::array::{ | ||
| Array, ArrayRef, BinaryArray, BooleanArray, Float32Array, Float64Array, Int16Array, | ||
| Int32Array, Int64Array, Int8Array, PrimitiveArray, UInt16Array, UInt32Array, | ||
| UInt64Array, UInt8Array, | ||
| }; | ||
| use crate::datatypes::{ArrowNumericType, DataType}; | ||
| use crate::error::{ArrowError, Result}; | ||
|
|
||
| /// Returns the minimum value in the array, according to the natural order. | ||
|
|
@@ -204,6 +210,93 @@ where | |
| Ok(b.finish()) | ||
| } | ||
|
|
||
| /// Builds a new `$array_type` holding the elements of `$array` whose matching | ||
| /// slot in `$filter` is `true`. | ||
| /// | ||
| /// `$array` is downcast to `$array_type`; the `unwrap` panics if the caller names | ||
| /// a type that does not match the array. Selected slots that are null in the | ||
| /// input are appended as nulls, so validity is carried over to the result. | ||
| /// | ||
| /// The expansion uses `?` and evaluates to `Ok(Arc<_>)`, so it must be used inside | ||
| /// a function returning a compatible `Result`. | ||
| macro_rules! filter_array { | ||
|     ($array:expr, $filter:expr, $array_type:ident) => {{ | ||
|         let b = $array.as_any().downcast_ref::<$array_type>().unwrap(); | ||
|         // Sized for the case where every element is selected. | ||
|         let mut builder = $array_type::builder(b.len()); | ||
|         for i in 0..b.len() { | ||
|             if $filter.value(i) { | ||
|                 if b.is_null(i) { | ||
|                     // Keep the slot null instead of copying whatever `value(i)` | ||
|                     // returns for an invalid slot. | ||
|                     builder.append_null()?; | ||
|                 } else { | ||
|                     builder.append_value(b.value(i))?; | ||
|                 } | ||
|             } | ||
|         } | ||
|         Ok(Arc::new(builder.finish())) | ||
|     }}; | ||
| } | ||
|
|
||
| /// Returns a new array containing only the elements of `array` whose | ||
| /// corresponding entry in `filter` is `true`. | ||
| /// | ||
| /// The integer, floating point and `Boolean` types are handled by `filter_array!`. | ||
| /// `Utf8` data is read through `BinaryArray` and rebuilt from the selected byte | ||
| /// slices; that branch does not consult the validity of the input slots. | ||
| /// | ||
| /// # Errors | ||
| /// | ||
| /// Returns `ArrowError::ComputeError` when `filter` has fewer elements than | ||
| /// `array`, or when the data type of `array` is not supported. | ||
| pub fn filter(array: &Array, filter: &BooleanArray) -> Result<ArrayRef> { | ||
|     // Every index of `array` is looked up in `filter`, so a shorter filter would | ||
|     // be read past its end. | ||
|     if filter.len() < array.len() { | ||
|         return Err(ArrowError::ComputeError(format!( | ||
|             "filter has {} elements but the array has {}", | ||
|             filter.len(), | ||
|             array.len() | ||
|         ))); | ||
|     } | ||
|     match array.data_type() { | ||
|         DataType::UInt8 => filter_array!(array, filter, UInt8Array), | ||
|         DataType::UInt16 => filter_array!(array, filter, UInt16Array), | ||
|         DataType::UInt32 => filter_array!(array, filter, UInt32Array), | ||
|         DataType::UInt64 => filter_array!(array, filter, UInt64Array), | ||
|         DataType::Int8 => filter_array!(array, filter, Int8Array), | ||
|         DataType::Int16 => filter_array!(array, filter, Int16Array), | ||
|         DataType::Int32 => filter_array!(array, filter, Int32Array), | ||
|         DataType::Int64 => filter_array!(array, filter, Int64Array), | ||
|         DataType::Float32 => filter_array!(array, filter, Float32Array), | ||
|         DataType::Float64 => filter_array!(array, filter, Float64Array), | ||
|         DataType::Boolean => filter_array!(array, filter, BooleanArray), | ||
|         DataType::Utf8 => { | ||
|             let b = array.as_any().downcast_ref::<BinaryArray>().unwrap(); | ||
|             // Sized for the case where every element is selected. | ||
|             let mut values: Vec<&[u8]> = Vec::with_capacity(b.len()); | ||
|             for i in 0..b.len() { | ||
|                 if filter.value(i) { | ||
|                     values.push(b.value(i)); | ||
|                 } | ||
|             } | ||
|             Ok(Arc::new(BinaryArray::from(values))) | ||
|         } | ||
|         other => Err(ArrowError::ComputeError(format!( | ||
|             "filter not supported for {:?}", | ||
|             other | ||
|         ))), | ||
|     } | ||
| } | ||
|
|
||
| /// Builds a new `$array_type` from the first `$num_elements` elements of `$array`. | ||
| /// | ||
| /// `$array` is downcast to `$array_type`; the `unwrap` panics if the caller names | ||
| /// a type that does not match the array. The caller is expected to pass a | ||
| /// `$num_elements` that does not exceed the length of the array. Slots that are | ||
| /// null in the input are appended as nulls, so validity is carried over. | ||
| /// | ||
| /// The expansion uses `?` and evaluates to `Ok(Arc<_>)`, so it must be used inside | ||
| /// a function returning a compatible `Result`. | ||
| macro_rules! limit_array { | ||
|     ($array:expr, $num_elements:expr, $array_type:ident) => {{ | ||
|         let b = $array.as_any().downcast_ref::<$array_type>().unwrap(); | ||
|         // Bind once so the argument expression is not evaluated twice. | ||
|         let count = $num_elements; | ||
|         let mut builder = $array_type::builder(count); | ||
|         for i in 0..count { | ||
|             if b.is_null(i) { | ||
|                 // Keep the slot null instead of copying whatever `value(i)` | ||
|                 // returns for an invalid slot. | ||
|                 builder.append_null()?; | ||
|             } else { | ||
|                 builder.append_value(b.value(i))?; | ||
|             } | ||
|         } | ||
|         Ok(Arc::new(builder.finish())) | ||
|     }}; | ||
| } | ||
|
|
||
| /// Returns a newly built array holding the first `num_elements` elements of `array`. | ||
| /// | ||
| /// If `num_elements` is larger than the length of the array, every element is | ||
| /// copied and the result has the same length as the input. | ||
| /// | ||
| /// The integer, floating point and `Boolean` types are handled by `limit_array!`. | ||
| /// `Utf8` data is read through `BinaryArray` and rebuilt from the leading byte | ||
| /// slices; that branch does not consult the validity of the input slots. | ||
| /// | ||
| /// # Errors | ||
| /// | ||
| /// Returns `ArrowError::ComputeError` when the data type of `array` is not supported. | ||
| pub fn limit(array: &Array, num_elements: usize) -> Result<ArrayRef> { | ||
|     // Clamp the request so the copy loops never index past the end of `array`. | ||
|     let num_elements_safe: usize = cmp::min(array.len(), num_elements); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One last nit, we could return the array as I'm happy with everything else
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe you can help me here: how can I wrap the reference to
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, you're right, I missed that part. We can improve limit when we have zero-copy array slicing 👍🏾 |
||
|
|
||
|     match array.data_type() { | ||
|         DataType::UInt8 => limit_array!(array, num_elements_safe, UInt8Array), | ||
|         DataType::UInt16 => limit_array!(array, num_elements_safe, UInt16Array), | ||
|         DataType::UInt32 => limit_array!(array, num_elements_safe, UInt32Array), | ||
|         DataType::UInt64 => limit_array!(array, num_elements_safe, UInt64Array), | ||
|         DataType::Int8 => limit_array!(array, num_elements_safe, Int8Array), | ||
|         DataType::Int16 => limit_array!(array, num_elements_safe, Int16Array), | ||
|         DataType::Int32 => limit_array!(array, num_elements_safe, Int32Array), | ||
|         DataType::Int64 => limit_array!(array, num_elements_safe, Int64Array), | ||
|         DataType::Float32 => limit_array!(array, num_elements_safe, Float32Array), | ||
|         DataType::Float64 => limit_array!(array, num_elements_safe, Float64Array), | ||
|         DataType::Boolean => limit_array!(array, num_elements_safe, BooleanArray), | ||
|         DataType::Utf8 => { | ||
|             let b = array.as_any().downcast_ref::<BinaryArray>().unwrap(); | ||
|             // Collect borrowed byte slices first; the new array is built from them below. | ||
|             let mut values: Vec<&[u8]> = Vec::with_capacity(num_elements_safe); | ||
|             for i in 0..num_elements_safe { | ||
|                 values.push(b.value(i)); | ||
|             } | ||
|             Ok(Arc::new(BinaryArray::from(values))) | ||
|         } | ||
|         other => Err(ArrowError::ComputeError(format!( | ||
|             "limit not supported for {:?}", | ||
|             other | ||
|         ))), | ||
|     } | ||
| } | ||
|
|
||
| #[cfg(test)] | ||
| mod tests { | ||
| use super::*; | ||
|
|
@@ -358,4 +451,61 @@ mod tests { | |
| assert_eq!(5, min(&a).unwrap()); | ||
| assert_eq!(9, max(&a).unwrap()); | ||
| } | ||
|
|
||
|     /// Filtering an `Int32Array` keeps only the values whose mask entry is `true`. | ||
|     #[test] | ||
|     fn test_filter_array() { | ||
|         let values = Int32Array::from(vec![5, 6, 7, 8, 9]); | ||
|         let mask = BooleanArray::from(vec![true, false, false, true, false]); | ||
|         let filtered = filter(&values, &mask).unwrap(); | ||
|         let ints = filtered | ||
|             .as_ref() | ||
|             .as_any() | ||
|             .downcast_ref::<Int32Array>() | ||
|             .unwrap(); | ||
|         assert_eq!(2, ints.len()); | ||
|         // Only positions 0 and 3 of the mask are set. | ||
|         for (i, expected) in [5, 8].iter().enumerate() { | ||
|             assert_eq!(*expected, ints.value(i)); | ||
|         } | ||
|     } | ||
|
|
||
|     /// Filtering a `BinaryArray` of strings keeps only the entries selected by the mask. | ||
|     #[test] | ||
|     fn test_filter_binary_array() { | ||
|         let words = BinaryArray::from(vec!["hello", " ", "world", "!"]); | ||
|         let mask = BooleanArray::from(vec![true, false, true, false]); | ||
|         let filtered = filter(&words, &mask).unwrap(); | ||
|         let strings = filtered | ||
|             .as_ref() | ||
|             .as_any() | ||
|             .downcast_ref::<BinaryArray>() | ||
|             .unwrap(); | ||
|         assert_eq!(2, strings.len()); | ||
|         // Only positions 0 and 2 of the mask are set. | ||
|         for (i, expected) in ["hello", "world"].iter().enumerate() { | ||
|             assert_eq!(*expected, strings.get_string(i)); | ||
|         } | ||
|     } | ||
|
|
||
|     /// Limiting an `Int32Array` to three elements yields the first three values. | ||
|     #[test] | ||
|     fn test_limit_array() { | ||
|         let values = Int32Array::from(vec![5, 6, 7, 8, 9]); | ||
|         let limited = limit(&values, 3).unwrap(); | ||
|         let ints = limited | ||
|             .as_ref() | ||
|             .as_any() | ||
|             .downcast_ref::<Int32Array>() | ||
|             .unwrap(); | ||
|         assert_eq!(3, ints.len()); | ||
|         for (i, expected) in [5, 6, 7].iter().enumerate() { | ||
|             assert_eq!(*expected, ints.value(i)); | ||
|         } | ||
|     } | ||
|
|
||
|     /// Limiting a `BinaryArray` of strings to two elements yields the first two entries. | ||
|     #[test] | ||
|     fn test_limit_binary_array() { | ||
|         let words = BinaryArray::from(vec!["hello", " ", "world", "!"]); | ||
|         let limited = limit(&words, 2).unwrap(); | ||
|         let strings = limited | ||
|             .as_ref() | ||
|             .as_any() | ||
|             .downcast_ref::<BinaryArray>() | ||
|             .unwrap(); | ||
|         assert_eq!(2, strings.len()); | ||
|         for (i, expected) in ["hello", " "].iter().enumerate() { | ||
|             assert_eq!(*expected, strings.get_string(i)); | ||
|         } | ||
|     } | ||
|
|
||
|     /// A limit larger than the array length returns every element of the input. | ||
|     #[test] | ||
|     fn test_limit_array_with_limit_too_large() { | ||
|         let source = Int32Array::from(vec![5, 6, 7, 8, 9]); | ||
|         let limited = limit(&source, 6).unwrap(); | ||
|         let ints = limited | ||
|             .as_ref() | ||
|             .as_any() | ||
|             .downcast_ref::<Int32Array>() | ||
|             .unwrap(); | ||
|         assert_eq!(5, ints.len()); | ||
|         // Each of the five positions must match the input. | ||
|         for i in 0..5 { | ||
|             assert_eq!(source.value(i), ints.value(i)); | ||
|         } | ||
|     } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would be more efficient to initialize these vectors with
Vec::with_capacity(v.len())