From 8cf84abc66c017c3b5091fd9f2dbfa5247ea356e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 5 Aug 2024 14:53:36 -0400 Subject: [PATCH 1/2] Minor: make it clearer that clone() is not slow --- datafusion/core/src/datasource/statistics.rs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/datafusion/core/src/datasource/statistics.rs b/datafusion/core/src/datasource/statistics.rs index 9d031a6bbc85..d284000eb0f2 100644 --- a/datafusion/core/src/datasource/statistics.rs +++ b/datafusion/core/src/datasource/statistics.rs @@ -93,10 +93,10 @@ pub async fn get_statistics_with_limit( // counts across all the files in question. If any file does not // provide any information or provides an inexact value, we demote // the statistic precision to inexact. - num_rows = add_row_stats(file_stats.num_rows.clone(), num_rows); + num_rows = add_row_stats(&file_stats.num_rows, &num_rows); total_byte_size = - add_row_stats(file_stats.total_byte_size.clone(), total_byte_size); + add_row_stats(&file_stats.total_byte_size, &total_byte_size); for (file_col_stats, col_stats) in file_stats .column_statistics @@ -110,8 +110,7 @@ pub async fn get_statistics_with_limit( distinct_count: _, } = file_col_stats; - col_stats.null_count = - add_row_stats(file_nc.clone(), col_stats.null_count.clone()); + col_stats.null_count = add_row_stats(file_nc, &col_stats.null_count); set_max_if_greater(file_max, &mut col_stats.max_value); set_min_if_lesser(file_min, &mut col_stats.min_value) } @@ -165,12 +164,12 @@ pub(crate) fn create_max_min_accs( } fn add_row_stats( - file_num_rows: Precision, - num_rows: Precision, + file_num_rows: &Precision, + num_rows: &Precision, ) -> Precision { match (file_num_rows, &num_rows) { - (Precision::Absent, _) => num_rows.to_inexact(), - (lhs, Precision::Absent) => lhs.to_inexact(), + (Precision::Absent, _) => num_rows.clone().to_inexact(), + (lhs, Precision::Absent) => lhs.clone().to_inexact(), (lhs, rhs) => lhs.add(rhs), } } From 7b71ea5af27e55467d1d7d021d5cd33ca61b72a6 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 8 Aug 2024 12:00:15 -0400 Subject: [PATCH 2/2] Make Precision Copy when T is Copy --- datafusion/common/src/stats.rs | 23 ++++++++++--- .../physical_plan/file_scan_config.rs | 2 +- datafusion/core/src/datasource/statistics.rs | 33 ++++++++++--------- datafusion/physical-expr/src/analysis.rs | 2 +- datafusion/physical-plan/src/filter.rs | 2 +- datafusion/physical-plan/src/joins/utils.rs | 10 +++--- 6 files changed, 43 insertions(+), 29 deletions(-) diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs index 6cefef8d0eb5..9ece0bd6c0a2 100644 --- a/datafusion/common/src/stats.rs +++ b/datafusion/common/src/stats.rs @@ -25,7 +25,7 @@ use arrow_schema::Schema; /// Represents a value with a degree of certainty. `Precision` is used to /// propagate information the precision of statistical values. -#[derive(Clone, PartialEq, Eq, Default)] +#[derive(Clone, PartialEq, Eq, Default, Copy)] pub enum Precision { /// The exact value is known Exact(T), @@ -417,9 +417,9 @@ mod tests { let inexact_precision = Precision::Inexact(42); let absent_precision = Precision::::Absent; - assert_eq!(exact_precision.clone().to_inexact(), inexact_precision); - assert_eq!(inexact_precision.clone().to_inexact(), inexact_precision); - assert_eq!(absent_precision.clone().to_inexact(), absent_precision); + assert_eq!(exact_precision.to_inexact(), inexact_precision); + assert_eq!(inexact_precision.to_inexact(), inexact_precision); + assert_eq!(absent_precision.to_inexact(), absent_precision); } #[test] @@ -459,4 +459,19 @@ mod tests { assert_eq!(precision2.multiply(&precision3), Precision::Inexact(15)); assert_eq!(precision1.multiply(&absent_precision), Precision::Absent); } + + #[test] + fn test_precision_cloning() { + // Precision is copy + let precision: Precision = Precision::Exact(42); + let p2 = precision; + assert_eq!(precision, p2); + + // Precision is not copy (requires .clone()) + let precision: Precision = + Precision::Exact(ScalarValue::Int64(Some(42))); + // Clippy would complain about this if it were Copy + let p2 = precision.clone(); + assert_eq!(precision, p2); + } } diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index 17850ea7585a..34fb6226c1a2 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -244,7 +244,7 @@ impl FileScanConfig { } let table_stats = Statistics { - num_rows: self.statistics.num_rows.clone(), + num_rows: self.statistics.num_rows, // TODO correct byte size? total_byte_size: Precision::Absent, column_statistics: table_cols_stats, diff --git a/datafusion/core/src/datasource/statistics.rs b/datafusion/core/src/datasource/statistics.rs index d284000eb0f2..33225da3b2d9 100644 --- a/datafusion/core/src/datasource/statistics.rs +++ b/datafusion/core/src/datasource/statistics.rs @@ -18,17 +18,18 @@ use std::mem; use std::sync::Arc; -use super::listing::PartitionedFile; -use crate::arrow::datatypes::{Schema, SchemaRef}; -use crate::error::Result; -use crate::functions_aggregate::min_max::{MaxAccumulator, MinAccumulator}; -use crate::physical_plan::{Accumulator, ColumnStatistics, Statistics}; use arrow_schema::DataType; +use futures::{Stream, StreamExt}; use datafusion_common::stats::Precision; use datafusion_common::ScalarValue; -use futures::{Stream, StreamExt}; +use crate::arrow::datatypes::{Schema, SchemaRef}; +use crate::error::Result; +use crate::functions_aggregate::min_max::{MaxAccumulator, MinAccumulator}; +use crate::physical_plan::{Accumulator, ColumnStatistics, Statistics}; + +use super::listing::PartitionedFile; /// Get all files as well as the file level summary statistics (no statistic for partition columns). /// If the optional `limit` is provided, includes only sufficient files. Needed to read up to @@ -62,8 +63,8 @@ pub async fn get_statistics_with_limit( result_files.push(file); // First file, we set them directly from the file statistics. - num_rows = file_stats.num_rows.clone(); - total_byte_size = file_stats.total_byte_size.clone(); + num_rows = file_stats.num_rows; + total_byte_size = file_stats.total_byte_size; for (index, file_column) in file_stats.column_statistics.clone().into_iter().enumerate() { @@ -93,10 +94,10 @@ pub async fn get_statistics_with_limit( // counts across all the files in question. If any file does not // provide any information or provides an inexact value, we demote // the statistic precision to inexact. - num_rows = add_row_stats(&file_stats.num_rows, &num_rows); + num_rows = add_row_stats(file_stats.num_rows, num_rows); total_byte_size = - add_row_stats(&file_stats.total_byte_size, &total_byte_size); + add_row_stats(file_stats.total_byte_size, total_byte_size); for (file_col_stats, col_stats) in file_stats .column_statistics @@ -110,7 +111,7 @@ pub async fn get_statistics_with_limit( distinct_count: _, } = file_col_stats; - col_stats.null_count = add_row_stats(file_nc, &col_stats.null_count); + col_stats.null_count = add_row_stats(*file_nc, col_stats.null_count); set_max_if_greater(file_max, &mut col_stats.max_value); set_min_if_lesser(file_min, &mut col_stats.min_value) } @@ -164,12 +165,12 @@ pub(crate) fn create_max_min_accs( } fn add_row_stats( - file_num_rows: &Precision, - num_rows: &Precision, + file_num_rows: Precision, + num_rows: Precision, ) -> Precision { match (file_num_rows, &num_rows) { - (Precision::Absent, _) => num_rows.clone().to_inexact(), - (lhs, Precision::Absent) => lhs.clone().to_inexact(), + (Precision::Absent, _) => num_rows.to_inexact(), + (lhs, Precision::Absent) => lhs.to_inexact(), (lhs, rhs) => lhs.add(rhs), } } @@ -191,7 +192,7 @@ pub(crate) fn get_col_stats( None => None, }; ColumnStatistics { - null_count: null_counts[i].clone(), + null_count: null_counts[i], max_value: max_value.map(Precision::Exact).unwrap_or(Precision::Absent), min_value: min_value.map(Precision::Exact).unwrap_or(Precision::Absent), distinct_count: Precision::Absent, diff --git a/datafusion/physical-expr/src/analysis.rs b/datafusion/physical-expr/src/analysis.rs index bcf1c8e510b1..3eac62a4df08 100644 --- a/datafusion/physical-expr/src/analysis.rs +++ b/datafusion/physical-expr/src/analysis.rs @@ -119,7 +119,7 @@ impl ExprBoundaries { Ok(ExprBoundaries { column, interval, - distinct_count: col_stats.distinct_count.clone(), + distinct_count: col_stats.distinct_count, }) } diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index 69bcfefcd476..aae46c6d9c06 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -324,7 +324,7 @@ fn collect_new_statistics( (Precision::Inexact(lower), Precision::Inexact(upper)) }; ColumnStatistics { - null_count: input_column_stats[idx].null_count.clone().to_inexact(), + null_count: input_column_stats[idx].null_count.to_inexact(), max_value, min_value, distinct_count: distinct_count.to_inexact(), diff --git a/datafusion/physical-plan/src/joins/utils.rs b/datafusion/physical-plan/src/joins/utils.rs index b8a58e4d0d30..80d8815bdebc 100644 --- a/datafusion/physical-plan/src/joins/utils.rs +++ b/datafusion/physical-plan/src/joins/utils.rs @@ -827,12 +827,12 @@ fn estimate_join_cardinality( JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full => { let ij_cardinality = estimate_inner_join_cardinality( Statistics { - num_rows: left_stats.num_rows.clone(), + num_rows: left_stats.num_rows, total_byte_size: Precision::Absent, column_statistics: left_col_stats, }, Statistics { - num_rows: right_stats.num_rows.clone(), + num_rows: right_stats.num_rows, total_byte_size: Precision::Absent, column_statistics: right_col_stats, }, @@ -1024,7 +1024,7 @@ fn max_distinct_count( stats: &ColumnStatistics, ) -> Precision { match &stats.distinct_count { - dc @ (Precision::Exact(_) | Precision::Inexact(_)) => dc.clone(), + &dc @ (Precision::Exact(_) | Precision::Inexact(_)) => dc, _ => { // The number can never be greater than the number of rows we have // minus the nulls (since they don't count as distinct values). @@ -2054,9 +2054,7 @@ mod tests { ); assert_eq!( partial_join_stats.map(|s| s.column_statistics), - expected_cardinality - .clone() - .map(|_| [left_col_stats, right_col_stats].concat()) + expected_cardinality.map(|_| [left_col_stats, right_col_stats].concat()) ); } Ok(())