Remove new PruningStatistics implementations from common::pruning

adriangb · adriangb · commit f4aa0962247e · 2025-05-16T21:35:25.000-04:00
diff --git a/datafusion/common/src/pruning.rs b/datafusion/common/src/pruning.rs
@@ -16,18 +16,12 @@
 // under the License.
 
 use std::collections::HashSet;
-use std::sync::Arc;
-
-use crate::stats::Precision;
-use arrow::array::UInt64Array;
-use arrow::datatypes::FieldRef;
 use arrow::{
     array::{ArrayRef, BooleanArray},
-    datatypes::{Schema, SchemaRef},
 };
 
 use crate::Column;
-use crate::{ScalarValue, Statistics};
+use crate::ScalarValue;
 
 /// A source of runtime statistical information to [`PruningPredicate`]s.
 ///
@@ -131,324 +125,3 @@ pub trait PruningStatistics {
     ) -> Option<BooleanArray>;
 }
 
-/// Prune files based on their partition values.
-/// This is used both at planning time and execution time to prune
-/// files based on their partition values.
-/// This feeds into [`CompositePruningStatistics`] to allow pruning
-/// with filters that depend both on partition columns and data columns
-/// (e.g. `WHERE partition_col = data_col`).
-pub struct PartitionPruningStatistics {
-    /// Values for each column for each container.
-    /// The outer vectors represent the columns while the inner
-    /// vectors represent the containers.
-    /// The order must match the order of the partition columns in
-    /// [`PartitionPruningStatistics::partition_schema`].
-    partition_values: Vec<Vec<ScalarValue>>,
-    /// The number of containers.
-    /// Stored since the partition values are column-major and if
-    /// there are no columns we wouldn't know the number of containers.
-    num_containers: usize,
-    /// The schema of the partition columns.
-    /// This must **not** be the schema of the entire file or table:
-    /// it must only be the schema of the partition columns,
-    /// in the same order as the values in [`PartitionPruningStatistics::partition_values`].
-    partition_schema: SchemaRef,
-}
-
-impl PartitionPruningStatistics {
-    /// Create a new instance of [`PartitionPruningStatistics`].
-    ///
-    /// Args:
-    /// * `partition_values`: A vector of vectors of [`ScalarValue`]s.
-    ///   The outer vector represents the containers while the inner
-    ///   vector represents the partition values for each column.
-    ///   Note that this is the **opposite** of the order of the
-    ///   partition columns in `PartitionPruningStatistics::partition_schema`.
-    /// * `partition_schema`: The schema of the partition columns.
-    ///   This must **not** be the schema of the entire file or table:
-    ///   instead it must only be the schema of the partition columns,
-    ///   in the same order as the values in `partition_values`.
-    pub fn new(
-        partition_values: Vec<Vec<ScalarValue>>,
-        partition_fields: Vec<FieldRef>,
-    ) -> Self {
-        let num_containers = partition_values.len();
-        let partition_schema = Arc::new(Schema::new(partition_fields));
-        let mut partition_valeus_by_column =
-            vec![vec![]; partition_schema.fields().len()];
-        for partition_value in partition_values.iter() {
-            for (i, value) in partition_value.iter().enumerate() {
-                partition_valeus_by_column[i].push(value.clone());
-            }
-        }
-        Self {
-            partition_values: partition_valeus_by_column,
-            num_containers,
-            partition_schema,
-        }
-    }
-}
-
-impl PruningStatistics for PartitionPruningStatistics {
-    fn min_values(&self, column: &Column) -> Option<ArrayRef> {
-        let index = self.partition_schema.index_of(column.name()).ok()?;
-        let partition_values = self.partition_values.get(index)?;
-        let mut values = Vec::with_capacity(self.partition_values.len());
-        for partition_value in partition_values {
-            match partition_value {
-                ScalarValue::Null => values.push(ScalarValue::Null),
-                _ => values.push(partition_value.clone()),
-            }
-        }
-        match ScalarValue::iter_to_array(values) {
-            Ok(array) => Some(array),
-            Err(_) => {
-                log::warn!(
-                    "Failed to convert min values to array for column {}",
-                    column.name()
-                );
-                None
-            }
-        }
-    }
-
-    fn max_values(&self, column: &Column) -> Option<ArrayRef> {
-        self.min_values(column)
-    }
-
-    fn num_containers(&self) -> usize {
-        self.num_containers
-    }
-
-    fn null_counts(&self, _column: &Column) -> Option<ArrayRef> {
-        None
-    }
-
-    fn row_counts(&self, _column: &Column) -> Option<ArrayRef> {
-        None
-    }
-
-    fn contained(
-        &self,
-        column: &Column,
-        values: &HashSet<ScalarValue>,
-    ) -> Option<BooleanArray> {
-        let index = self.partition_schema.index_of(column.name()).ok()?;
-        let partition_values = self.partition_values.get(index)?;
-        let mut contained = Vec::with_capacity(self.partition_values.len());
-        for partition_value in partition_values {
-            let contained_value = if values.contains(partition_value) {
-                Some(true)
-            } else {
-                Some(false)
-            };
-            contained.push(contained_value);
-        }
-        let array = BooleanArray::from(contained);
-        Some(array)
-    }
-}
-
-/// Prune a set of containers represented by their statistics.
-/// Each [`Statistics`] represents a container (e.g. a file or a partition of files).
-pub struct PrunableStatistics {
-    /// Statistics for each container.
-    statistics: Vec<Arc<Statistics>>,
-    /// The schema of the file these statistics are for.
-    schema: SchemaRef,
-}
-
-impl PrunableStatistics {
-    /// Create a new instance of [`PrunableStatistics`].
-    /// Each [`Statistics`] represents a container (e.g. a file or a partition of files).
-    /// The `schema` is the schema of the data in the containers and should apply to all files.
-    pub fn new(statistics: Vec<Arc<Statistics>>, schema: SchemaRef) -> Self {
-        Self { statistics, schema }
-    }
-}
-
-impl PruningStatistics for PrunableStatistics {
-    fn min_values(&self, column: &Column) -> Option<ArrayRef> {
-        let index = self.schema.index_of(column.name()).ok()?;
-        let mut values = Vec::with_capacity(self.statistics.len());
-        for stats in &self.statistics {
-            let stat = stats.column_statistics.get(index)?;
-            match &stat.min_value {
-                Precision::Exact(min) => {
-                    values.push(min.clone());
-                }
-                _ => values.push(ScalarValue::Null),
-            }
-        }
-        match ScalarValue::iter_to_array(values) {
-            Ok(array) => Some(array),
-            Err(_) => {
-                log::warn!(
-                    "Failed to convert min values to array for column {}",
-                    column.name()
-                );
-                None
-            }
-        }
-    }
-
-    fn max_values(&self, column: &Column) -> Option<ArrayRef> {
-        let index = self.schema.index_of(column.name()).ok()?;
-        let mut values = Vec::with_capacity(self.statistics.len());
-        for stats in &self.statistics {
-            let stat = stats.column_statistics.get(index)?;
-            match &stat.max_value {
-                Precision::Exact(max) => {
-                    values.push(max.clone());
-                }
-                _ => values.push(ScalarValue::Null),
-            }
-        }
-        match ScalarValue::iter_to_array(values) {
-            Ok(array) => Some(array),
-            Err(_) => {
-                log::warn!(
-                    "Failed to convert max values to array for column {}",
-                    column.name()
-                );
-                None
-            }
-        }
-    }
-
-    fn num_containers(&self) -> usize {
-        self.statistics.len()
-    }
-
-    fn null_counts(&self, column: &Column) -> Option<ArrayRef> {
-        let index = self.schema.index_of(column.name()).ok()?;
-        let mut values = Vec::with_capacity(self.statistics.len());
-        let mut has_null_count = false;
-        for stats in &self.statistics {
-            let stat = stats.column_statistics.get(index)?;
-            match &stat.null_count {
-                Precision::Exact(null_count) => match u64::try_from(*null_count) {
-                    Ok(null_count) => {
-                        has_null_count = true;
-                        values.push(Some(null_count));
-                    }
-                    Err(_) => {
-                        values.push(None);
-                    }
-                },
-                _ => values.push(None),
-            }
-        }
-        if has_null_count {
-            Some(Arc::new(UInt64Array::from(values)))
-        } else {
-            None
-        }
-    }
-
-    fn row_counts(&self, _column: &Column) -> Option<ArrayRef> {
-        let mut values = Vec::with_capacity(self.statistics.len());
-        let mut has_row_count = false;
-        for stats in &self.statistics {
-            match &stats.num_rows {
-                Precision::Exact(row_count) => match u64::try_from(*row_count) {
-                    Ok(row_count) => {
-                        has_row_count = true;
-                        values.push(Some(row_count));
-                    }
-                    Err(_) => {
-                        values.push(None);
-                    }
-                },
-                _ => values.push(None),
-            }
-        }
-        if has_row_count {
-            Some(Arc::new(UInt64Array::from(values)))
-        } else {
-            None
-        }
-    }
-
-    fn contained(
-        &self,
-        _column: &Column,
-        _values: &HashSet<ScalarValue>,
-    ) -> Option<BooleanArray> {
-        None
-    }
-}
-
-/// Combine multiple [`PruningStatistics`] into a single
-/// [`CompositePruningStatistics`].
-/// This can be used to combine statistics from different sources,
-/// for example partition values and file statistics.
-/// This allows pruning with filters that depend on multiple sources of statistics,
-/// such as `WHERE partition_col = data_col`.
-pub struct CompositePruningStatistics {
-    pub statistics: Vec<Box<dyn PruningStatistics>>,
-}
-
-impl CompositePruningStatistics {
-    /// Create a new instance of [`CompositePruningStatistics`] from
-    /// a vector of [`PruningStatistics`].
-    pub fn new(statistics: Vec<Box<dyn PruningStatistics>>) -> Self {
-        assert!(!statistics.is_empty());
-        Self { statistics }
-    }
-}
-
-impl PruningStatistics for CompositePruningStatistics {
-    fn min_values(&self, column: &Column) -> Option<ArrayRef> {
-        for stats in &self.statistics {
-            if let Some(array) = stats.min_values(column) {
-                return Some(array);
-            }
-        }
-        None
-    }
-
-    fn max_values(&self, column: &Column) -> Option<ArrayRef> {
-        for stats in &self.statistics {
-            if let Some(array) = stats.max_values(column) {
-                return Some(array);
-            }
-        }
-        None
-    }
-
-    fn num_containers(&self) -> usize {
-        self.statistics[0].num_containers()
-    }
-
-    fn null_counts(&self, column: &Column) -> Option<ArrayRef> {
-        for stats in &self.statistics {
-            if let Some(array) = stats.null_counts(column) {
-                return Some(array);
-            }
-        }
-        None
-    }
-
-    fn row_counts(&self, column: &Column) -> Option<ArrayRef> {
-        for stats in &self.statistics {
-            if let Some(array) = stats.row_counts(column) {
-                return Some(array);
-            }
-        }
-        None
-    }
-
-    fn contained(
-        &self,
-        column: &Column,
-        values: &HashSet<ScalarValue>,
-    ) -> Option<BooleanArray> {
-        for stats in &self.statistics {
-            if let Some(array) = stats.contained(column, values) {
-                return Some(array);
-            }
-        }
-        None
-    }
-}