From f1702ba0b28aa61a09c3b838f355ad6363e5ffd9 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 19 Jun 2023 09:38:44 -0400 Subject: [PATCH 1/8] Add percentile_cont aggregation function Refactor median to use the PercentileContAccumulator with percentile 0.5 --- datafusion/expr/src/aggregate_function.rs | 13 + .../expr/src/type_coercion/aggregates.rs | 27 ++ datafusion/physical-expr/Cargo.toml | 1 + .../src/aggregate/approx_percentile_cont.rs | 2 +- .../physical-expr/src/aggregate/build_in.rs | 8 + .../physical-expr/src/aggregate/median.rs | 140 +-------- datafusion/physical-expr/src/aggregate/mod.rs | 1 + .../src/aggregate/percentile_cont.rs | 293 ++++++++++++++++++ .../physical-expr/src/expressions/mod.rs | 1 + 9 files changed, 351 insertions(+), 135 deletions(-) create mode 100644 datafusion/physical-expr/src/aggregate/percentile_cont.rs diff --git a/datafusion/expr/src/aggregate_function.rs b/datafusion/expr/src/aggregate_function.rs index 5b0676a815099..37cb9bfd7e0f5 100644 --- a/datafusion/expr/src/aggregate_function.rs +++ b/datafusion/expr/src/aggregate_function.rs @@ -61,6 +61,8 @@ pub enum AggregateFunction { CovariancePop, /// Correlation Correlation, + /// Continuous Percentile + PercentileCont, /// Approximate continuous percentile function ApproxPercentileCont, /// Approximate continuous percentile function with weight @@ -102,6 +104,7 @@ impl AggregateFunction { Covariance => "COVARIANCE", CovariancePop => "COVARIANCE_POP", Correlation => "CORRELATION", + PercentileCont => "PERCENTILE_CONT", ApproxPercentileCont => "APPROX_PERCENTILE_CONT", ApproxPercentileContWithWeight => "APPROX_PERCENTILE_CONT_WITH_WEIGHT", ApproxMedian => "APPROX_MEDIAN", @@ -152,6 +155,7 @@ impl FromStr for AggregateFunction { "var" => AggregateFunction::Variance, "var_pop" => AggregateFunction::VariancePop, "var_samp" => AggregateFunction::Variance, + "percentile_cont" => AggregateFunction::PercentileCont, // approximate "approx_distinct" => AggregateFunction::ApproxDistinct, "approx_median" => AggregateFunction::ApproxMedian, @@ -214,6 +218,7 @@ pub fn return_type( coerced_data_types[0].clone(), true, )))), + AggregateFunction::PercentileCont => Ok(coerced_data_types[0].clone()), AggregateFunction::ApproxPercentileCont => Ok(coerced_data_types[0].clone()), AggregateFunction::ApproxPercentileContWithWeight => { Ok(coerced_data_types[0].clone()) @@ -286,6 +291,14 @@ pub fn signature(fun: &AggregateFunction) -> Signature { AggregateFunction::Correlation => { Signature::uniform(2, NUMERICS.to_vec(), Volatility::Immutable) } + AggregateFunction::PercentileCont => Signature::one_of( + // Accept any numeric value paired with a float64 percentile + NUMERICS + .iter() + .map(|t| TypeSignature::Exact(vec![t.clone(), DataType::Float64])) + .collect(), + Volatility::Immutable, + ), AggregateFunction::ApproxPercentileCont => { // Accept any numeric value paired with a float64 percentile let with_tdigest_size = NUMERICS.iter().map(|t| { diff --git a/datafusion/expr/src/type_coercion/aggregates.rs b/datafusion/expr/src/type_coercion/aggregates.rs index 4f02bf3dfd2a3..6058d72442376 100644 --- a/datafusion/expr/src/type_coercion/aggregates.rs +++ b/datafusion/expr/src/type_coercion/aggregates.rs @@ -210,6 +210,24 @@ pub fn coerce_types( } Ok(input_types.to_vec()) } + AggregateFunction::PercentileCont => { + if !is_percentile_cont_supported_arg_type(&input_types[0]) { + return Err(DataFusionError::Plan(format!( + "The function {:?} does not support inputs of type {:?}.", + agg_fun, input_types[0] + ))); + } + let mut result = input_types.to_vec(); + if can_coerce_from(&DataType::Float64, &input_types[1]) { + result[1] = DataType::Float64; + } else { + return Err(DataFusionError::Plan(format!( + "Could not coerce the percent argument for {:?} to Float64. Was {:?}.", + agg_fun, input_types[1] + ))); + } + Ok(result) + } AggregateFunction::ApproxPercentileCont => { if !is_approx_percentile_cont_supported_arg_type(&input_types[0]) { return Err(DataFusionError::Plan(format!( @@ -527,6 +545,15 @@ pub fn is_integer_arg_type(arg_type: &DataType) -> bool { ) } +/// Return `true` if `arg_type` is of a [`DataType`] that the +/// [`AggregateFunction::PercentileCont`] aggregation can operate on. +pub fn is_percentile_cont_supported_arg_type(arg_type: &DataType) -> bool { + matches!( + arg_type, + arg_type if NUMERICS.contains(arg_type) + ) +} + /// Return `true` if `arg_type` is of a [`DataType`] that the /// [`AggregateFunction::ApproxPercentileCont`] aggregation can operate on. pub fn is_approx_percentile_cont_supported_arg_type(arg_type: &DataType) -> bool { diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index b851c00edc2ba..b641c7bde9bb0 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -67,6 +67,7 @@ regex = { version = "1.8", optional = true } sha2 = { version = "^0.10.1", optional = true } unicode-segmentation = { version = "^1.7.1", optional = true } uuid = { version = "^1.2", features = ["v4"] } +bigdecimal = "0.3.0" [dev-dependencies] criterion = "0.5" diff --git a/datafusion/physical-expr/src/aggregate/approx_percentile_cont.rs b/datafusion/physical-expr/src/aggregate/approx_percentile_cont.rs index f0a44cc97a66d..60c39433dc99d 100644 --- a/datafusion/physical-expr/src/aggregate/approx_percentile_cont.rs +++ b/datafusion/physical-expr/src/aggregate/approx_percentile_cont.rs @@ -130,7 +130,7 @@ impl PartialEq for ApproxPercentileCont { } } -fn validate_input_percentile_expr(expr: &Arc) -> Result { +pub(crate) fn validate_input_percentile_expr(expr: &Arc) -> Result { // Extract the desired percentile literal let lit = expr .as_any() diff --git a/datafusion/physical-expr/src/aggregate/build_in.rs b/datafusion/physical-expr/src/aggregate/build_in.rs index 71ddf91315130..70f381916f9f4 100644 --- a/datafusion/physical-expr/src/aggregate/build_in.rs +++ b/datafusion/physical-expr/src/aggregate/build_in.rs @@ -229,6 +229,14 @@ pub fn create_aggregate_expr( "CORR(DISTINCT) aggregations are not available".to_string(), )); } + (AggregateFunction::PercentileCont, false) => Arc::new( + expressions::PercentileCont::try_new(input_phy_exprs, name, rt_type)?, + ), + (AggregateFunction::PercentileCont, true) => { + return Err(DataFusionError::NotImplemented( + "percentile_cont(DISTINCT) aggregations are not available".to_string(), + )); + } (AggregateFunction::ApproxPercentileCont, false) => { if input_phy_exprs.len() == 2 { Arc::new(expressions::ApproxPercentileCont::new( diff --git a/datafusion/physical-expr/src/aggregate/median.rs b/datafusion/physical-expr/src/aggregate/median.rs index 6f79c98a6c3a5..47ef33857d84f 100644 --- a/datafusion/physical-expr/src/aggregate/median.rs +++ b/datafusion/physical-expr/src/aggregate/median.rs @@ -17,13 +17,12 @@ //! # Median +use crate::aggregate::percentile_cont::PercentileContAccumulator; use crate::aggregate::utils::down_cast_any_ref; use crate::expressions::format_state_name; use crate::{AggregateExpr, PhysicalExpr}; -use arrow::array::{Array, ArrayRef, UInt32Array}; -use arrow::compute::sort_to_indices; use arrow::datatypes::{DataType, Field}; -use datafusion_common::{DataFusionError, Result, ScalarValue}; +use datafusion_common::Result; use datafusion_expr::Accumulator; use std::any::Any; use std::sync::Arc; @@ -64,9 +63,10 @@ impl AggregateExpr for Median { } fn create_accumulator(&self) -> Result> { - Ok(Box::new(MedianAccumulator { + Ok(Box::new(PercentileContAccumulator { data_type: self.data_type.clone(), all_values: vec![], + percentile: 0.5, })) } @@ -104,136 +104,6 @@ impl PartialEq for Median { } } -#[derive(Debug)] -/// The median accumulator accumulates the raw input values -/// as `ScalarValue`s -/// -/// The intermediate state is represented as a List of those scalars -struct MedianAccumulator { - data_type: DataType, - all_values: Vec, -} - -impl Accumulator for MedianAccumulator { - fn state(&self) -> Result> { - let state = - ScalarValue::new_list(Some(self.all_values.clone()), self.data_type.clone()); - Ok(vec![state]) - } - - fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { - assert_eq!(values.len(), 1); - let array = &values[0]; - - assert_eq!(array.data_type(), &self.data_type); - self.all_values.reserve(array.len()); - for index in 0..array.len() { - self.all_values - .push(ScalarValue::try_from_array(array, index)?); - } - - Ok(()) - } - - fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> { - assert_eq!(states.len(), 1); - - let array = &states[0]; - assert!(matches!(array.data_type(), DataType::List(_))); - for index in 0..array.len() { - match ScalarValue::try_from_array(array, index)? { - ScalarValue::List(Some(mut values), _) => { - self.all_values.append(&mut values); - } - ScalarValue::List(None, _) => {} // skip empty state - v => { - return Err(DataFusionError::Internal(format!( - "unexpected state in median. Expected DataType::List, got {v:?}" - ))) - } - } - } - Ok(()) - } - - fn evaluate(&self) -> Result { - if !self.all_values.iter().any(|v| !v.is_null()) { - return ScalarValue::try_from(&self.data_type); - } - - // Create an array of all the non null values and find the - // sorted indexes - let array = ScalarValue::iter_to_array( - self.all_values - .iter() - // ignore null values - .filter(|v| !v.is_null()) - .cloned(), - )?; - - // find the mid point - let len = array.len(); - let mid = len / 2; - - // only sort up to the top size/2 elements - let limit = Some(mid + 1); - let options = None; - let indices = sort_to_indices(&array, options, limit)?; - - // pick the relevant indices in the original arrays - let result = if len >= 2 && len % 2 == 0 { - // even number of values, average the two mid points - let s1 = scalar_at_index(&array, &indices, mid - 1)?; - let s2 = scalar_at_index(&array, &indices, mid)?; - match s1.add(s2)? { - ScalarValue::Int8(Some(v)) => ScalarValue::Int8(Some(v / 2)), - ScalarValue::Int16(Some(v)) => ScalarValue::Int16(Some(v / 2)), - ScalarValue::Int32(Some(v)) => ScalarValue::Int32(Some(v / 2)), - ScalarValue::Int64(Some(v)) => ScalarValue::Int64(Some(v / 2)), - ScalarValue::UInt8(Some(v)) => ScalarValue::UInt8(Some(v / 2)), - ScalarValue::UInt16(Some(v)) => ScalarValue::UInt16(Some(v / 2)), - ScalarValue::UInt32(Some(v)) => ScalarValue::UInt32(Some(v / 2)), - ScalarValue::UInt64(Some(v)) => ScalarValue::UInt64(Some(v / 2)), - ScalarValue::Float32(Some(v)) => ScalarValue::Float32(Some(v / 2.0)), - ScalarValue::Float64(Some(v)) => ScalarValue::Float64(Some(v / 2.0)), - ScalarValue::Decimal128(Some(v), p, s) => { - ScalarValue::Decimal128(Some(v / 2), p, s) - } - v => { - return Err(DataFusionError::Internal(format!( - "Unsupported type in MedianAccumulator: {v:?}" - ))) - } - } - } else { - // odd number of values, pick that one - scalar_at_index(&array, &indices, mid)? - }; - - Ok(result) - } - - fn size(&self) -> usize { - std::mem::size_of_val(self) + ScalarValue::size_of_vec(&self.all_values) - - std::mem::size_of_val(&self.all_values) - + self.data_type.size() - - std::mem::size_of_val(&self.data_type) - } -} - -/// Given a returns `array[indicies[indicie_index]]` as a `ScalarValue` -fn scalar_at_index( - array: &dyn Array, - indices: &UInt32Array, - indicies_index: usize, -) -> Result { - let array_index = indices - .value(indicies_index) - .try_into() - .expect("Convert uint32 to usize"); - ScalarValue::try_from_array(array, array_index) -} - #[cfg(test)] mod tests { use super::*; @@ -242,7 +112,9 @@ mod tests { use crate::generic_test_op; use arrow::record_batch::RecordBatch; use arrow::{array::*, datatypes::*}; + use datafusion_common::DataFusionError; use datafusion_common::Result; + use datafusion_common::ScalarValue; #[test] fn median_decimal() -> Result<()> { diff --git a/datafusion/physical-expr/src/aggregate/mod.rs b/datafusion/physical-expr/src/aggregate/mod.rs index 7d2316c532a0a..1cb312e5d81ca 100644 --- a/datafusion/physical-expr/src/aggregate/mod.rs +++ b/datafusion/physical-expr/src/aggregate/mod.rs @@ -46,6 +46,7 @@ pub(crate) mod min_max; pub mod build_in; mod hyperloglog; pub mod moving_min_max; +pub(crate) mod percentile_cont; pub mod row_accumulator; pub(crate) mod stats; pub(crate) mod stddev; diff --git a/datafusion/physical-expr/src/aggregate/percentile_cont.rs b/datafusion/physical-expr/src/aggregate/percentile_cont.rs new file mode 100644 index 0000000000000..06760be45ff53 --- /dev/null +++ b/datafusion/physical-expr/src/aggregate/percentile_cont.rs @@ -0,0 +1,293 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! # Continuous Percentile + +use crate::aggregate::approx_percentile_cont::validate_input_percentile_expr; +use crate::aggregate::utils::down_cast_any_ref; +use crate::expressions::format_state_name; +use crate::{AggregateExpr, PhysicalExpr}; +use arrow::array::{Array, ArrayRef}; +use arrow::compute::sort_to_indices; +use arrow::datatypes::{DataType, Field}; +use arrow_array::UInt32Array; +use bigdecimal::{BigDecimal, FromPrimitive, ToPrimitive}; +use datafusion_common::{DataFusionError, Result, ScalarValue}; +use datafusion_expr::Accumulator; +use std::any::Any; +use std::sync::Arc; + +/// PERCENTILE_CONT aggregate expression. This uses a lot of memory because all values need to be +/// stored in memory before a result can be computed. If an approximation is sufficient +/// then APPROX_PERCENTILE_CONT provides a much more efficient solution. +#[derive(Debug)] +pub struct PercentileCont { + name: String, + expr: Vec>, + percentile: f64, + data_type: DataType, +} + +impl PercentileCont { + /// Create a new PERCENTILE_CONT aggregate function + pub fn try_new( + expr: Vec>, + name: impl Into, + data_type: DataType, + ) -> Result { + // Arguments should be [ColumnExpr, DesiredPercentileLiteral] + debug_assert_eq!(expr.len(), 2); + + let percentile = validate_input_percentile_expr(&expr[1])?; + Ok(Self { + name: name.into(), + expr, + percentile, + data_type, + }) + } +} + +impl AggregateExpr for PercentileCont { + /// Return a reference to Any that can be used for downcasting + fn as_any(&self) -> &dyn Any { + self + } + + fn field(&self) -> Result { + Ok(Field::new(&self.name, self.data_type.clone(), true)) + } + + fn create_accumulator(&self) -> Result> { + Ok(Box::new(PercentileContAccumulator { + data_type: self.data_type.clone(), + all_values: vec![], + percentile: self.percentile, + })) + } + + fn state_fields(&self) -> Result> { + //Intermediate state is a list of the elements we have collected so far + let field = Field::new("item", self.data_type.clone(), true); + let data_type = DataType::List(Arc::new(field)); + + Ok(vec![Field::new( + format_state_name(&self.name, "percentile_cont"), + data_type, + true, + )]) + } + + fn expressions(&self) -> Vec> { + self.expr.clone() + } + + fn name(&self) -> &str { + &self.name + } +} + +impl PartialEq for PercentileCont { + fn eq(&self, other: &dyn Any) -> bool { + down_cast_any_ref(other) + .downcast_ref::() + .map(|x| { + self.name == x.name + && self.percentile == x.percentile + && self.data_type == x.data_type + && self + .expr + .iter() + .zip(x.expr.iter()) + .all(|(this, other)| this.eq(other)) + }) + .unwrap_or(false) + } +} + +#[derive(Debug)] +/// The percentile_cont accumulator accumulates the raw input values +/// as `ScalarValue`s +/// +/// The intermediate state is represented as a List of those scalars +pub(crate) struct PercentileContAccumulator { + pub data_type: DataType, + pub all_values: Vec, + pub percentile: f64, +} + +impl Accumulator for PercentileContAccumulator { + fn state(&self) -> Result> { + let state = + ScalarValue::new_list(Some(self.all_values.clone()), self.data_type.clone()); + Ok(vec![state]) + } + + fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { + let array = &values[0]; + + assert_eq!(array.data_type(), &self.data_type); + self.all_values.reserve(array.len()); + for index in 0..array.len() { + self.all_values + .push(ScalarValue::try_from_array(array, index)?); + } + + Ok(()) + } + + fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> { + assert_eq!(states.len(), 1); + + let array = &states[0]; + assert!(matches!(array.data_type(), DataType::List(_))); + for index in 0..array.len() { + match ScalarValue::try_from_array(array, index)? { + ScalarValue::List(Some(mut values), _) => { + self.all_values.append(&mut values); + } + ScalarValue::List(None, _) => {} // skip empty state + v => { + return Err(DataFusionError::Internal(format!( + "unexpected state in percentile_cont. Expected DataType::List, got {v:?}" + ))) + } + } + } + Ok(()) + } + + fn evaluate(&self) -> Result { + if !self.all_values.iter().any(|v| !v.is_null()) { + return ScalarValue::try_from(&self.data_type); + } + + // Create an array of all the non null values and find the + // sorted indexes + let array = ScalarValue::iter_to_array( + self.all_values + .iter() + // ignore null values + .filter(|v| !v.is_null()) + .cloned(), + )?; + + // find the mid point + let len = array.len(); + let r = (len - 1) as f64 * self.percentile; + + let limit = Some(r.ceil() as usize + 1); + let options = None; + let indices = sort_to_indices(&array, options, limit)?; + + let r_lower = r.floor() as usize; + let r_upper = r.ceil() as usize; + + let result = if r_lower == r_upper { + // Exact value found, pick that one + scalar_at_index(&array, &indices, r_lower)? + } else { + // Interpolate between upper and lower values + let s_lower = scalar_at_index(&array, &indices, r_lower)?; + let s_upper = scalar_at_index(&array, &indices, r_upper)?; + + // Convert lower/upper values and percentile to BigDecimal + let big_lower = scalar_to_bigdecimal(&s_lower); + let big_upper = scalar_to_bigdecimal(&s_upper); + let big_percentile = BigDecimal::from_f64(self.percentile); + + if let (Some(big_s1), Some(big_s2), Some(big_percentile)) = + (big_lower, big_upper, big_percentile) + { + // Perform percentile calculation with BigDecimal values to preserve precision + let big_result = big_s1.clone() + (big_s2 - big_s1) * big_percentile; + + // Convert result back to column type + match array.data_type() { + DataType::Int8 => ScalarValue::Int8(big_result.to_i8()), + DataType::Int16 => ScalarValue::Int16(big_result.to_i16()), + DataType::Int32 => ScalarValue::Int32(big_result.to_i32()), + DataType::Int64 => ScalarValue::Int64(big_result.to_i64()), + DataType::UInt8 => ScalarValue::UInt8(big_result.to_u8()), + DataType::UInt16 => ScalarValue::UInt16(big_result.to_u16()), + DataType::UInt32 => ScalarValue::UInt32(big_result.to_u32()), + DataType::UInt64 => ScalarValue::UInt64(big_result.to_u64()), + DataType::Float16 => ScalarValue::Float32(big_result.to_f32()), + DataType::Float32 => ScalarValue::Float32(big_result.to_f32()), + DataType::Float64 => ScalarValue::Float64(big_result.to_f64()), + _ => ScalarValue::try_from(array.data_type())?, + } + } else if scalar_is_non_finite(&s_lower) || scalar_is_non_finite(&s_upper) { + // If upper or lower value is a non-finite float, use NaN instead of NULL + match array.data_type() { + DataType::Float32 => ScalarValue::Float32(Some(f32::NAN)), + DataType::Float64 => ScalarValue::Float64(Some(f64::NAN)), + _ => ScalarValue::try_from(array.data_type())?, + } + } else { + // Otherwise, NULL + ScalarValue::try_from(array.data_type())? + } + }; + + Ok(result) + } + + fn size(&self) -> usize { + std::mem::size_of_val(self) + ScalarValue::size_of_vec(&self.all_values) + - std::mem::size_of_val(&self.all_values) + + self.data_type.size() + - std::mem::size_of_val(&self.data_type) + } +} + +fn scalar_to_bigdecimal(v: &ScalarValue) -> Option { + match v { + ScalarValue::Int8(Some(v)) => Some(BigDecimal::from(*v)), + ScalarValue::Int16(Some(v)) => Some(BigDecimal::from(*v)), + ScalarValue::Int32(Some(v)) => Some(BigDecimal::from(*v)), + ScalarValue::Int64(Some(v)) => Some(BigDecimal::from(*v)), + ScalarValue::UInt8(Some(v)) => Some(BigDecimal::from(*v)), + ScalarValue::UInt16(Some(v)) => Some(BigDecimal::from(*v)), + ScalarValue::UInt32(Some(v)) => Some(BigDecimal::from(*v)), + ScalarValue::UInt64(Some(v)) => Some(BigDecimal::from(*v)), + ScalarValue::Float32(Some(v)) => BigDecimal::from_f32(*v), + ScalarValue::Float64(Some(v)) => BigDecimal::from_f64(*v), + _ => None, + } +} + +fn scalar_is_non_finite(v: &ScalarValue) -> bool { + match v { + ScalarValue::Float32(Some(v)) => !v.is_finite(), + ScalarValue::Float64(Some(v)) => !v.is_finite(), + _ => false, + } +} + +/// Given a returns `array[indicies[indicie_index]]` as a `ScalarValue` +fn scalar_at_index( + array: &dyn Array, + indices: &UInt32Array, + indicies_index: usize, +) -> Result { + let array_index = indices + .value(indicies_index) + .try_into() + .expect("Convert uint32 to usize"); + ScalarValue::try_from_array(array, array_index) +} diff --git a/datafusion/physical-expr/src/expressions/mod.rs b/datafusion/physical-expr/src/expressions/mod.rs index 0ca132aefdac3..7ad12d24b24af 100644 --- a/datafusion/physical-expr/src/expressions/mod.rs +++ b/datafusion/physical-expr/src/expressions/mod.rs @@ -59,6 +59,7 @@ pub use crate::aggregate::grouping::Grouping; pub use crate::aggregate::median::Median; pub use crate::aggregate::min_max::{Max, Min}; pub use crate::aggregate::min_max::{MaxAccumulator, MinAccumulator}; +pub use crate::aggregate::percentile_cont::PercentileCont; pub use crate::aggregate::stats::StatsType; pub use crate::aggregate::stddev::{Stddev, StddevPop}; pub use crate::aggregate::sum::Sum; From e69ce4c1589cb148c0d0f7bab839242928213cae Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 19 Jun 2023 09:39:39 -0400 Subject: [PATCH 2/8] Add sqllogictest tests --- .../sqllogictests/test_files/aggregate.slt | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/datafusion/core/tests/sqllogictests/test_files/aggregate.slt b/datafusion/core/tests/sqllogictests/test_files/aggregate.slt index bf1ab2cbd1fb1..31b199db1a9df 100644 --- a/datafusion/core/tests/sqllogictests/test_files/aggregate.slt +++ b/datafusion/core/tests/sqllogictests/test_files/aggregate.slt @@ -355,6 +355,24 @@ SELECT median(c12) FROM aggregate_test_100 ---- 0.551390054439 +# csv_query_percentile_cont_1 +query I +SELECT percentile_cont(c2, 0.23) FROM aggregate_test_100 +---- +2 + +# csv_query_percentile_cont_2 +query I +SELECT percentile_cont(c6, 0.9) FROM aggregate_test_100 +---- +7384349873015760243 + +# csv_query_percentile_cont_3 +query R +SELECT percentile_cont(c12, 0.42) FROM aggregate_test_100 +---- +0.405942413398 + # median_i8 query I SELECT median(col_i8) FROM median_table @@ -421,6 +439,72 @@ SELECT median(col_f64_nan) FROM median_table ---- NaN +# percentile_cont_i8 +query I +SELECT percentile_cont(col_i8, 0.11) FROM median_table +---- +-128 + +# percentile_cont_i16 +query I +SELECT percentile_cont(col_i16, 0.22) FROM median_table +---- +-32768 + +# percentile_cont_i32 +query I +SELECT percentile_cont(col_i32, 0.33) FROM median_table +---- +-2147483648 + +# percentile_cont_i64 +query I +SELECT percentile_cont(col_i64, 0.44) FROM median_table +---- +-5165088340638674408 + +# percentile_cont_u8 +query I +SELECT percentile_cont(col_u8, 0.55) FROM median_table +---- +55 + +# percentile_cont_u16 +query I +SELECT percentile_cont(col_u16, 0.66) FROM median_table +---- +66 + +# percentile_cont_u32 +query I +SELECT percentile_cont(col_u32, 0.77) FROM median_table +---- +3307124840 + +# percentile_cont_u64 +query I +SELECT percentile_cont(col_u64, 0.88) FROM median_table +---- +16233134784864405433 + +# percentile_cont_f32 +query R +SELECT percentile_cont(col_f32, 0.9) FROM median_table +---- +4.29 + +# percentile_cont_f64 +query R +SELECT percentile_cont(col_f64, 0.96) FROM median_table +---- +4.356 + +# percentile_cont_f64_nan +query R +SELECT percentile_cont(col_f64_nan, 0.29) FROM median_table +---- +NaN + # approx_median_f64_nan query R SELECT approx_median(col_f64_nan) FROM median_table @@ -443,6 +527,12 @@ select host, median(usage) from cpu group by host; host0 90.1 host1 90.3 +query TRR rowsort +select host, percentile_cont(usage, 0.25), percentile_cont(usage, 0.75) from cpu group by host; +---- +host0 90.1 90.1 +host1 90.25 90.35 + statement ok drop table cpu; @@ -466,6 +556,11 @@ select median(usage) from cpu; ---- 90.2 +query RR +select percentile_cont(usage, 0.25), percentile_cont(usage, 0.75) from cpu; +---- +90.125 90.35 + statement ok drop table cpu; From 0c20fdff668863e13f38cc6b2e1aede05cf95de7 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 19 Jun 2023 09:40:18 -0400 Subject: [PATCH 3/8] Precision change in describe test --- datafusion/core/tests/dataframe/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index 67c0363bf5ed4..8a3fcfbd10bfd 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -296,7 +296,7 @@ async fn describe() -> Result<()> { "| std | 2107.472815166704 | null | 2.8724780750809518 | 2.8724780750809518 | 2.8724780750809518 | 28.724780750809533 | 3.1597258182544645 | 29.012028558317645 | null | null | null | 0.5000342500942125 | 3.44808750051728 |", "| min | 0.0 | null | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 01/01/09 | 0 | 2008-12-31T23:00:00 | 2009.0 | 1.0 |", "| max | 7299.0 | null | 9.0 | 9.0 | 9.0 | 90.0 | 9.899999618530273 | 90.89999999999999 | 12/31/10 | 9 | 2010-12-31T04:09:13.860 | 2010.0 | 12.0 |", - "| median | 3649.0 | null | 4.0 | 4.0 | 4.0 | 45.0 | 4.949999809265137 | 45.45 | null | null | null | 2009.0 | 7.0 |", + "| median | 3649.0 | null | 4.0 | 4.0 | 4.0 | 45.0 | 4.949999809265137 | 45.44999999999999 | null | null | null | 2009.0 | 7.0 |", "+------------+-------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+------------+-------------------------+--------------------+-------------------+", ]; assert_batches_eq!(expected, &describe_record_batch); From 4d63f768572e78db202696937cd98db2bd0811fa Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 19 Jun 2023 09:41:50 -0400 Subject: [PATCH 4/8] fmt --- .../physical-expr/src/aggregate/approx_percentile_cont.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datafusion/physical-expr/src/aggregate/approx_percentile_cont.rs b/datafusion/physical-expr/src/aggregate/approx_percentile_cont.rs index 60c39433dc99d..dbebb065553a3 100644 --- a/datafusion/physical-expr/src/aggregate/approx_percentile_cont.rs +++ b/datafusion/physical-expr/src/aggregate/approx_percentile_cont.rs @@ -130,7 +130,9 @@ impl PartialEq for ApproxPercentileCont { } } -pub(crate) fn validate_input_percentile_expr(expr: &Arc) -> Result { +pub(crate) fn validate_input_percentile_expr( + expr: &Arc, +) -> Result { // Extract the desired percentile literal let lit = expr .as_any() From e2332a4070ad36f947389b15423ea85c03643590 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 19 Jun 2023 09:59:50 -0400 Subject: [PATCH 5/8] Add to protobuf --- datafusion/proto/proto/datafusion.proto | 1 + datafusion/proto/src/generated/pbjson.rs | 3 +++ datafusion/proto/src/generated/prost.rs | 3 +++ datafusion/proto/src/logical_plan/from_proto.rs | 1 + datafusion/proto/src/logical_plan/to_proto.rs | 4 ++++ 5 files changed, 12 insertions(+) diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index 9b05dea712943..149ba774565f1 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -599,6 +599,7 @@ enum AggregateFunction { // we append "_AGG" to obey name scoping rules. FIRST_VALUE_AGG = 24; LAST_VALUE_AGG = 25; + PERCENTILE_CONT = 26; } message AggregateExprNode { diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index 890fe7221a8e3..d4eee4f37f6f0 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -465,6 +465,7 @@ impl serde::Serialize for AggregateFunction { Self::BoolOr => "BOOL_OR", Self::FirstValueAgg => "FIRST_VALUE_AGG", Self::LastValueAgg => "LAST_VALUE_AGG", + Self::PercentileCont => "PERCENTILE_CONT", }; serializer.serialize_str(variant) } @@ -502,6 +503,7 @@ impl<'de> serde::Deserialize<'de> for AggregateFunction { "BOOL_OR", "FIRST_VALUE_AGG", "LAST_VALUE_AGG", + "PERCENTILE_CONT", ]; struct GeneratedVisitor; @@ -570,6 +572,7 @@ impl<'de> serde::Deserialize<'de> for AggregateFunction { "BOOL_OR" => Ok(AggregateFunction::BoolOr), "FIRST_VALUE_AGG" => Ok(AggregateFunction::FirstValueAgg), "LAST_VALUE_AGG" => Ok(AggregateFunction::LastValueAgg), + "PERCENTILE_CONT" => Ok(AggregateFunction::PercentileCont), _ => Err(serde::de::Error::unknown_variant(value, FIELDS)), } } diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index b1ae0058dcb2c..6cfe686381e2e 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -2470,6 +2470,7 @@ pub enum AggregateFunction { /// we append "_AGG" to obey name scoping rules. FirstValueAgg = 24, LastValueAgg = 25, + PercentileCont = 26, } impl AggregateFunction { /// String value of the enum field names used in the ProtoBuf definition. @@ -2506,6 +2507,7 @@ impl AggregateFunction { AggregateFunction::BoolOr => "BOOL_OR", AggregateFunction::FirstValueAgg => "FIRST_VALUE_AGG", AggregateFunction::LastValueAgg => "LAST_VALUE_AGG", + AggregateFunction::PercentileCont => "PERCENTILE_CONT", } } /// Creates an enum from field names used in the ProtoBuf definition. @@ -2539,6 +2541,7 @@ impl AggregateFunction { "BOOL_OR" => Some(Self::BoolOr), "FIRST_VALUE_AGG" => Some(Self::FirstValueAgg), "LAST_VALUE_AGG" => Some(Self::LastValueAgg), + "PERCENTILE_CONT" => Some(Self::PercentileCont), _ => None, } } diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index ab2985f448a81..95bb6ba0e7737 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -538,6 +538,7 @@ impl From for AggregateFunction { protobuf::AggregateFunction::Stddev => Self::Stddev, protobuf::AggregateFunction::StddevPop => Self::StddevPop, protobuf::AggregateFunction::Correlation => Self::Correlation, + protobuf::AggregateFunction::PercentileCont => Self::PercentileCont, protobuf::AggregateFunction::ApproxPercentileCont => { Self::ApproxPercentileCont } diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index dbf9432bdc2dc..2fb3be8f25193 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -381,6 +381,7 @@ impl From<&AggregateFunction> for protobuf::AggregateFunction { AggregateFunction::Stddev => Self::Stddev, AggregateFunction::StddevPop => Self::StddevPop, AggregateFunction::Correlation => Self::Correlation, + AggregateFunction::PercentileCont => Self::PercentileCont, AggregateFunction::ApproxPercentileCont => Self::ApproxPercentileCont, AggregateFunction::ApproxPercentileContWithWeight => { Self::ApproxPercentileContWithWeight @@ -667,6 +668,9 @@ impl TryFrom<&Expr> for protobuf::LogicalExprNode { AggregateFunction::ApproxMedian => { protobuf::AggregateFunction::ApproxMedian } + AggregateFunction::PercentileCont => { + protobuf::AggregateFunction::PercentileCont + } AggregateFunction::Grouping => protobuf::AggregateFunction::Grouping, AggregateFunction::Median => protobuf::AggregateFunction::Median, AggregateFunction::FirstValue => { From 45c5dc109d0b45acaa4da12f40bef73c06248433 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 19 Jun 2023 10:26:14 -0400 Subject: [PATCH 6/8] cargo update in datafusion-cli --- datafusion-cli/Cargo.lock | 91 ++++++++++++++++++++++----------------- 1 file changed, 51 insertions(+), 40 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 9f6296b5b08c5..52b3811f16046 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -47,9 +47,9 @@ dependencies = [ [[package]] name = "allocator-api2" -version = "0.2.14" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4f263788a35611fba42eb41ff811c5d0360c58b97402570312a350736e2542e" +checksum = "56fc6cf8dc8c4158eed8649f9b8b0ea1518eb62b544fe9490d66fa0b349eafe9" [[package]] name = "android-tzdata" @@ -74,9 +74,9 @@ checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545" [[package]] name = "arrayvec" -version = "0.7.2" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" +checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" [[package]] name = "arrow" @@ -640,6 +640,17 @@ dependencies = [ "vsimd", ] +[[package]] +name = "bigdecimal" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6773ddc0eafc0e509fb60e48dff7f450f8e674a0686ae8605e8d9901bd5eefa" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -905,9 +916,9 @@ checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" [[package]] name = "cpufeatures" -version = "0.2.7" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e4c1eaa2012c47becbbad2ab175484c2a84d1185b566fb2cc5b8707343dfe58" +checksum = "03e69e28e9f7f77debdedbaafa2866e1de9ba56df55a8bd7cfc724c25a09987c" dependencies = [ "libc", ] @@ -1105,6 +1116,7 @@ dependencies = [ "arrow-array", "arrow-buffer", "arrow-schema", + "bigdecimal", "blake2", "blake3", "chrono", @@ -1632,7 +1644,7 @@ checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7" dependencies = [ "http", "hyper", - "rustls 0.21.1", + "rustls 0.21.2", "tokio", "tokio-rustls 0.24.1", ] @@ -1738,9 +1750,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.63" +version = "0.3.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f37a4a5928311ac501dee68b3c7613a1037d0edb30c8e5427bd832d55d1b790" +checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" dependencies = [ "wasm-bindgen", ] @@ -2448,7 +2460,7 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", - "rustls 0.21.1", + "rustls 0.21.2", "rustls-pemfile", "serde", "serde_json", @@ -2518,9 +2530,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.21.1" +version = "0.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c911ba11bc8433e811ce56fde130ccf32f5127cab0e0194e9c68c5a5b671791e" +checksum = "e32ca28af694bc1bbf399c33a516dbdf1c90090b8ab23c2bc24f834aa2247f5f" dependencies = [ "log", "ring", @@ -2530,9 +2542,9 @@ dependencies = [ [[package]] name = "rustls-native-certs" -version = "0.6.2" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0167bac7a9f490495f3c33013e7722b53cb087ecbe082fb0c6387c96f634ea50" +checksum = "a9aace74cb666635c918e9c12bc0d348266037aa8eb599b5cba565709a8dff00" dependencies = [ "openssl-probe", "rustls-pemfile", @@ -2685,9 +2697,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.96" +version = "1.0.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1" +checksum = "bdf3bf93142acad5821c99197022e170842cdbc1c30482b98750c688c640842a" dependencies = [ "itoa", "ryu", @@ -2708,9 +2720,9 @@ dependencies = [ [[package]] name = "sha2" -version = "0.10.6" +version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0" +checksum = "479fb9d862239e610720565ca91403019f2f00410f1864c5aa7479b950a76ed8" dependencies = [ "cfg-if", "cpufeatures", @@ -3027,7 +3039,7 @@ version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" dependencies = [ - "rustls 0.21.1", + "rustls 0.21.2", "tokio", ] @@ -3099,9 +3111,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.24" +version = "0.1.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74" +checksum = "8803eee176538f94ae9a14b55b2804eb7e1441f8210b1c31290b3bccdccff73b" dependencies = [ "proc-macro2", "quote", @@ -3203,9 +3215,9 @@ checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" [[package]] name = "uuid" -version = "1.3.3" +version = "1.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "345444e32442451b267fc254ae85a209c64be56d2890e601a0c37ff0c3c5ecd2" +checksum = "0fa2982af2eec27de306107c027578ff7f423d65f7250e40ce0fea8f45248b81" dependencies = [ "getrandom", ] @@ -3234,11 +3246,10 @@ dependencies = [ [[package]] name = "want" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ce8a968cb1cd110d136ff8b819a556d6fb6d919363c61534f6860c7eb172ba0" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" dependencies = [ - "log", "try-lock", ] @@ -3250,9 +3261,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.86" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bba0e8cb82ba49ff4e229459ff22a191bbe9a1cb3a341610c9c33efc27ddf73" +checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -3260,9 +3271,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.86" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b04bc93f9d6bdee709f6bd2118f57dd6679cf1176a1af464fca3ab0d66d8fb" +checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" dependencies = [ "bumpalo", "log", @@ -3275,9 +3286,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.36" +version = "0.4.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d1985d03709c53167ce907ff394f5316aa22cb4e12761295c5dc57dacb6297e" +checksum = "c02dbc21516f9f1f04f187958890d7e6026df8d16540b7ad9492bc34a67cea03" dependencies = [ "cfg-if", "js-sys", @@ -3287,9 +3298,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.86" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14d6b024f1a526bb0234f52840389927257beb670610081360e5a03c5df9c258" +checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3297,9 +3308,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.86" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8" +checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", @@ -3310,9 +3321,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.86" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93" +checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" [[package]] name = "wasm-streams" @@ -3329,9 +3340,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.63" +version = "0.3.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bdd9ef4e984da1187bf8110c5cf5b845fbc87a23602cdf912386a76fcd3a7c2" +checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" dependencies = [ "js-sys", "wasm-bindgen", From e401dd794c3a8d267bc9c44c833eb886e6cc57fa Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 19 Jun 2023 10:38:26 -0400 Subject: [PATCH 7/8] sort deps --- datafusion/physical-expr/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index b641c7bde9bb0..f54eb4be23e93 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -47,6 +47,7 @@ arrow = { workspace = true } arrow-array = { workspace = true } arrow-buffer = { workspace = true } arrow-schema = { workspace = true } +bigdecimal = "0.3.0" blake2 = { version = "^0.10.2", optional = true } blake3 = { version = "1.0", optional = true } chrono = { version = "0.4.23", default-features = false } @@ -67,7 +68,6 @@ regex = { version = "1.8", optional = true } sha2 = { version = "^0.10.1", optional = true } unicode-segmentation = { version = "^1.7.1", optional = true } uuid = { version = "^1.2", features = ["v4"] } -bigdecimal = "0.3.0" [dev-dependencies] criterion = "0.5" From cec8c2ab8affc38c7b5080b4d2eeca323e48aed8 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 19 Jun 2023 11:00:43 -0400 Subject: [PATCH 8/8] Revert "Precision change in describe test" This reverts commit 0c20fdff668863e13f38cc6b2e1aede05cf95de7. --- datafusion/core/tests/dataframe/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index 8a3fcfbd10bfd..67c0363bf5ed4 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -296,7 +296,7 @@ async fn describe() -> Result<()> { "| std | 2107.472815166704 | null | 2.8724780750809518 | 2.8724780750809518 | 2.8724780750809518 | 28.724780750809533 | 3.1597258182544645 | 29.012028558317645 | null | null | null | 0.5000342500942125 | 3.44808750051728 |", "| min | 0.0 | null | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 01/01/09 | 0 | 2008-12-31T23:00:00 | 2009.0 | 1.0 |", "| max | 7299.0 | null | 9.0 | 9.0 | 9.0 | 90.0 | 9.899999618530273 | 90.89999999999999 | 12/31/10 | 9 | 2010-12-31T04:09:13.860 | 2010.0 | 12.0 |", - "| median | 3649.0 | null | 4.0 | 4.0 | 4.0 | 45.0 | 4.949999809265137 | 45.44999999999999 | null | null | null | 2009.0 | 7.0 |", + "| median | 3649.0 | null | 4.0 | 4.0 | 4.0 | 45.0 | 4.949999809265137 | 45.45 | null | null | null | 2009.0 | 7.0 |", "+------------+-------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+------------+-------------------------+--------------------+-------------------+", ]; assert_batches_eq!(expected, &describe_record_batch);