diff --git a/rust/arrow/src/compute/kernels/aggregate.rs b/rust/arrow/src/compute/kernels/aggregate.rs index d0e3f22f541..aafe1cd7855 100644 --- a/rust/arrow/src/compute/kernels/aggregate.rs +++ b/rust/arrow/src/compute/kernels/aggregate.rs @@ -19,10 +19,14 @@ use std::ops::Add; +use super::sort::{total_cmp_32, total_cmp_64}; use crate::array::{ - Array, BooleanArray, GenericStringArray, PrimitiveArray, StringOffsetSizeTrait, + Array, BooleanArray, Float32Array, Float64Array, GenericStringArray, PrimitiveArray, + StringOffsetSizeTrait, }; -use crate::datatypes::{ArrowNativeType, ArrowNumericType}; +use crate::datatypes::{ArrowNativeType, ArrowNumericType, DataType}; +use num::{Float, NumCast}; +use std::cmp::Ordering; /// Generic test for NaN, the optimizer should be able to remove this for integer types. #[inline] @@ -66,6 +70,141 @@ fn min_max_string bool>( Some(n) } +/// Selects how the `min_float` / `max_float` kernels treat NaN values. +#[derive(Debug)] +pub enum NaNBehavior { + /// If the input contains a NaN, the result is NaN. + Propagate, + /// NaN values are skipped in favour of non-NaN values. + Ignore, + /// Values are compared with `total_cmp_32` / `total_cmp_64`, so NaN takes part in the ordering. + TotalOrdering, +} + +/// `NaNBehavior::TotalOrdering` is the default behavior. +impl Default for NaNBehavior { + fn default() -> Self { + NaNBehavior::TotalOrdering + } +} + +/// Returns the minimum value in the array, according to the natural order. +/// This kernel accepts Options to define the NaN behavior. Note that this is different than missing data. 
+///
+/// `nan_behavior` selects how NaN values are treated; `None` falls back to
+/// `NaNBehavior::default()`:
+///
+/// * `Propagate`: the result is NaN as soon as one compared value is NaN,
+///   otherwise it is the ordinary minimum.
+/// * `Ignore`: NaN values are skipped; NaN is only returned when every
+///   compared value is NaN.
+/// * `TotalOrdering`: values are compared with `total_cmp_32` / `total_cmp_64`.
+///
+/// # Panics
+///
+/// Panics if the data type of `array` is neither `Float32` nor `Float64`.
+pub fn min_float<T>(
+    array: &PrimitiveArray<T>,
+    nan_behavior: Option<NaNBehavior>,
+) -> Option<T::Native>
+where
+    T: ArrowNumericType,
+    T::Native: Float,
+{
+    let nan_behavior = nan_behavior.unwrap_or_default();
+
+    // Both predicates follow the `min_max_helper` contract: returning `true`
+    // means `b` replaces the current minimum.
+
+    /// Skips NaN values whenever a non-NaN value is available.
+    fn ignore_path<F: Float>(min: &F, b: &F) -> bool {
+        match (min.is_nan(), b.is_nan()) {
+            (true, true) => false,
+            // the current minimum is NaN: any real value replaces it
+            (true, false) => true,
+            // the candidate is NaN: keep the current minimum
+            (false, true) => false,
+            (false, false) => min > b,
+        }
+    }
+
+    /// Makes NaN sticky: once seen it is never replaced.
+    ///
+    /// This cannot be written as `!ignore_path(..)`: for two non-NaN values
+    /// the negation becomes `min <= b`, which selects the maximum.
+    fn propagate_path<F: Float>(min: &F, b: &F) -> bool {
+        match (min.is_nan(), b.is_nan()) {
+            // the current minimum is already NaN: keep it
+            (true, _) => false,
+            // first NaN encountered: it becomes the result
+            (false, true) => true,
+            (false, false) => min > b,
+        }
+    }
+
+    match array.data_type() {
+        DataType::Float32 => {
+            let array = array
+                .as_any()
+                .downcast_ref::<Float32Array>()
+                .expect("f32 array");
+            let out = match nan_behavior {
+                NaNBehavior::Propagate => min_max_helper(array, propagate_path),
+                NaNBehavior::TotalOrdering => min_max_helper(array, |a, b| {
+                    total_cmp_32(*a, *b) == Ordering::Greater
+                }),
+                NaNBehavior::Ignore => min_max_helper(array, ignore_path),
+            };
+            // convert the concrete f32 back into the generic native type
+            out.map(|float| NumCast::from(float).expect("T::Native"))
+        }
+        DataType::Float64 => {
+            let array = array
+                .as_any()
+                .downcast_ref::<Float64Array>()
+                .expect("f64 array");
+            let out = match nan_behavior {
+                NaNBehavior::Propagate => min_max_helper(array, propagate_path),
+                NaNBehavior::TotalOrdering => min_max_helper(array, |a, b| {
+                    total_cmp_64(*a, *b) == Ordering::Greater
+                }),
+                NaNBehavior::Ignore => min_max_helper(array, ignore_path),
+            };
+            // convert the concrete f64 back into the generic native type
+            out.map(|float| NumCast::from(float).expect("T::Native"))
+        }
+        _ => unreachable!(),
+    }
+}
+
+/// Returns the maximum value in the array, according to the natural order.
+/// This kernel accepts Options to define the NaN behavior. Note that this is different than missing data.
+///
+/// `nan_behavior` selects how NaN values are treated; `None` falls back to
+/// `NaNBehavior::default()`:
+///
+/// * `Propagate`: the result is NaN as soon as one compared value is NaN,
+///   otherwise it is the ordinary maximum.
+/// * `Ignore`: NaN values are skipped; NaN is only returned when every
+///   compared value is NaN.
+/// * `TotalOrdering`: values are compared with `total_cmp_32` / `total_cmp_64`.
+///
+/// # Panics
+///
+/// Panics if the data type of `array` is neither `Float32` nor `Float64`.
+pub fn max_float<T>(
+    array: &PrimitiveArray<T>,
+    nan_behavior: Option<NaNBehavior>,
+) -> Option<T::Native>
+where
+    T: ArrowNumericType,
+    T::Native: Float,
+{
+    let nan_behavior = nan_behavior.unwrap_or_default();
+
+    // Both predicates follow the `min_max_helper` contract: returning `true`
+    // means `b` replaces the current maximum.
+
+    /// Skips NaN values whenever a non-NaN value is available.
+    fn ignore_path<F: Float>(max: &F, b: &F) -> bool {
+        match (max.is_nan(), b.is_nan()) {
+            (true, true) => false,
+            // the current maximum is NaN: any real value replaces it
+            (true, false) => true,
+            // the candidate is NaN: keep the current maximum
+            (false, true) => false,
+            (false, false) => max < b,
+        }
+    }
+
+    /// Makes NaN sticky: once seen it is never replaced.
+    ///
+    /// This cannot be written as `!ignore_path(..)`: for two non-NaN values
+    /// the negation becomes `max >= b`, which selects the minimum.
+    fn propagate_path<F: Float>(max: &F, b: &F) -> bool {
+        match (max.is_nan(), b.is_nan()) {
+            // the current maximum is already NaN: keep it
+            (true, _) => false,
+            // first NaN encountered: it becomes the result
+            (false, true) => true,
+            (false, false) => max < b,
+        }
+    }
+
+    match array.data_type() {
+        DataType::Float32 => {
+            let array = array
+                .as_any()
+                .downcast_ref::<Float32Array>()
+                .expect("f32 array");
+            let out = match nan_behavior {
+                NaNBehavior::Propagate => min_max_helper(array, propagate_path),
+                NaNBehavior::TotalOrdering => {
+                    min_max_helper(array, |a, b| total_cmp_32(*a, *b) == Ordering::Less)
+                }
+                NaNBehavior::Ignore => min_max_helper(array, ignore_path),
+            };
+            // convert the concrete f32 back into the generic native type
+            out.map(|float| NumCast::from(float).expect("T::Native"))
+        }
+        DataType::Float64 => {
+            let array = array
+                .as_any()
+                .downcast_ref::<Float64Array>()
+                .expect("f64 array");
+            let out = match nan_behavior {
+                NaNBehavior::Propagate => min_max_helper(array, propagate_path),
+                NaNBehavior::TotalOrdering => {
+                    min_max_helper(array, |a, b| total_cmp_64(*a, *b) == Ordering::Less)
+                }
+                NaNBehavior::Ignore => min_max_helper(array, ignore_path),
+            };
+            // convert the concrete f64 back into the generic native type
+            out.map(|float| NumCast::from(float).expect("T::Native"))
+        }
+        _ => unreachable!(),
+    }
+}
+
 /// Returns the minimum value in the array, according to the natural order.
/// For floating point arrays any NaN values are considered to be greater than any other non-null value #[cfg(not(simd))] @@ -828,6 +967,33 @@ mod tests { assert!(max(&a).unwrap().is_nan()); } + /// Exercises `min_float` and `max_float` under every `NaNBehavior`, with the NaN placed last, first and in the middle of the input. + #[test] + #[allow(clippy::float_cmp)] + fn test_float_aggregation_paths() { + let a = Float64Array::from_iter_values(vec![1.0, 2.0, f64::NAN]); + let b = Float64Array::from_iter_values(vec![f64::NAN, 1.0, 2.0]); + let c = Float64Array::from_iter_values(vec![2.0, f64::NAN, 1.0]); + + // NOTE(review): every input below contains a NaN, so the Propagate path is never checked on NaN-free data. + for array in &[a, b, c] { + assert_eq!(min_float(array, Some(NaNBehavior::Ignore)).unwrap(), 1.0); + assert_eq!( + min_float(array, Some(NaNBehavior::TotalOrdering)).unwrap(), + 1.0 + ); + assert!(min_float(array, Some(NaNBehavior::Propagate)) + .unwrap() + .is_nan()); + + assert_eq!(max_float(array, Some(NaNBehavior::Ignore)).unwrap(), 2.0); + assert!(max_float(array, Some(NaNBehavior::TotalOrdering)) + .unwrap() + .is_nan()); + assert!(max_float(array, Some(NaNBehavior::Propagate)) + .unwrap() + .is_nan()); + } + } + #[test] fn test_primitive_min_max_float_last_nan_nonnull() { let a: Float64Array = (0..100) diff --git a/rust/arrow/src/compute/kernels/sort.rs b/rust/arrow/src/compute/kernels/sort.rs index 70efef4cf5d..e9a53738aa2 100644 --- a/rust/arrow/src/compute/kernels/sort.rs +++ b/rust/arrow/src/compute/kernels/sort.rs @@ -44,7 +44,7 @@ pub fn sort(values: &ArrayRef, options: Option) -> Result // implements comparison using IEEE 754 total ordering for f32 // Original implementation from https://doc.rust-lang.org/std/primitive.f64.html#method.total_cmp // TODO to change to use std when it becomes stable -fn total_cmp_32(l: f32, r: f32) -> std::cmp::Ordering { +pub(crate) fn total_cmp_32(l: f32, r: f32) -> std::cmp::Ordering { let mut left = l.to_bits() as i32; let mut right = r.to_bits() as i32; @@ -57,7 +57,7 @@ fn total_cmp_32(l: f32, r: f32) -> std::cmp::Ordering { // implements comparison using IEEE 754 total ordering for f64 // Original implementation from 
https://doc.rust-lang.org/std/primitive.f64.html#method.total_cmp // TODO to change to use std when it becomes stable -fn total_cmp_64(l: f64, r: f64) -> std::cmp::Ordering { +pub(crate) fn total_cmp_64(l: f64, r: f64) -> std::cmp::Ordering { let mut left = l.to_bits() as i64; let mut right = r.to_bits() as i64;