Skip to content

Commit 46d3f52

Browse files
authored
Move PruningStatistics into datafusion::common (#16069)
* Move PruningStatistics into datafusion::common * fix doc * remove new code * fmt
1 parent febc77e commit 46d3f52

File tree

7 files changed

+134
-104
lines changed

7 files changed

+134
-104
lines changed

datafusion-examples/examples/parquet_index.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ use arrow::datatypes::{Int32Type, SchemaRef};
2323
use arrow::util::pretty::pretty_format_batches;
2424
use async_trait::async_trait;
2525
use datafusion::catalog::Session;
26+
use datafusion::common::pruning::PruningStatistics;
2627
use datafusion::common::{
2728
internal_datafusion_err, DFSchema, DataFusionError, Result, ScalarValue,
2829
};
@@ -39,7 +40,7 @@ use datafusion::parquet::arrow::{
3940
arrow_reader::ParquetRecordBatchReaderBuilder, ArrowWriter,
4041
};
4142
use datafusion::physical_expr::PhysicalExpr;
42-
use datafusion::physical_optimizer::pruning::{PruningPredicate, PruningStatistics};
43+
use datafusion::physical_optimizer::pruning::PruningPredicate;
4344
use datafusion::physical_plan::ExecutionPlan;
4445
use datafusion::prelude::*;
4546
use std::any::Any;

datafusion-examples/examples/pruning.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,11 @@ use std::sync::Arc;
2020

2121
use arrow::array::{ArrayRef, BooleanArray, Int32Array};
2222
use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
23+
use datafusion::common::pruning::PruningStatistics;
2324
use datafusion::common::{DFSchema, ScalarValue};
2425
use datafusion::execution::context::ExecutionProps;
2526
use datafusion::physical_expr::create_physical_expr;
26-
use datafusion::physical_optimizer::pruning::{PruningPredicate, PruningStatistics};
27+
use datafusion::physical_optimizer::pruning::PruningPredicate;
2728
use datafusion::prelude::*;
2829

2930
/// This example shows how to use DataFusion's `PruningPredicate` to prove

datafusion/common/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ pub mod format;
4747
pub mod hash_utils;
4848
pub mod instant;
4949
pub mod parsers;
50+
pub mod pruning;
5051
pub mod rounding;
5152
pub mod scalar;
5253
pub mod spans;

datafusion/common/src/pruning.rs

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use arrow::array::{ArrayRef, BooleanArray};
19+
use std::collections::HashSet;
20+
21+
use crate::Column;
22+
use crate::ScalarValue;
23+
24+
/// A source of runtime statistical information to [`PruningPredicate`]s.
25+
///
26+
/// # Supported Information
27+
///
28+
/// 1. Minimum and maximum values for columns
29+
///
30+
/// 2. Null counts and row counts for columns
31+
///
32+
/// 3. Whether the values in a column are contained in a set of literals
33+
///
34+
/// # Vectorized Interface
35+
///
36+
/// Information for containers / files are returned as Arrow [`ArrayRef`], so
37+
/// the evaluation happens once on a single `RecordBatch`, which amortizes the
38+
/// overhead of evaluating the predicate. This is important when pruning 1000s
39+
/// of containers which often happens in analytic systems that have 1000s of
40+
/// potential files to consider.
41+
///
42+
/// For example, for the following three files with a single column `a`:
43+
/// ```text
44+
/// file1: column a: min=5, max=10
45+
/// file2: column a: No stats
46+
/// file2: column a: min=20, max=30
47+
/// ```
48+
///
49+
/// PruningStatistics would return:
50+
///
51+
/// ```text
52+
/// min_values("a") -> Some([5, Null, 20])
53+
/// max_values("a") -> Some([10, Null, 30])
54+
/// min_values("X") -> None
55+
/// ```
56+
///
57+
/// [`PruningPredicate`]: https://docs.rs/datafusion/latest/datafusion/physical_optimizer/pruning/struct.PruningPredicate.html
58+
pub trait PruningStatistics {
59+
/// Return the minimum values for the named column, if known.
60+
///
61+
/// If the minimum value for a particular container is not known, the
62+
/// returned array should have `null` in that row. If the minimum value is
63+
/// not known for any row, return `None`.
64+
///
65+
/// Note: the returned array must contain [`Self::num_containers`] rows
66+
fn min_values(&self, column: &Column) -> Option<ArrayRef>;
67+
68+
/// Return the maximum values for the named column, if known.
69+
///
70+
/// See [`Self::min_values`] for when to return `None` and null values.
71+
///
72+
/// Note: the returned array must contain [`Self::num_containers`] rows
73+
fn max_values(&self, column: &Column) -> Option<ArrayRef>;
74+
75+
/// Return the number of containers (e.g. Row Groups) being pruned with
76+
/// these statistics.
77+
///
78+
/// This value corresponds to the size of the [`ArrayRef`] returned by
79+
/// [`Self::min_values`], [`Self::max_values`], [`Self::null_counts`],
80+
/// and [`Self::row_counts`].
81+
fn num_containers(&self) -> usize;
82+
83+
/// Return the number of null values for the named column as an
84+
/// [`UInt64Array`]
85+
///
86+
/// See [`Self::min_values`] for when to return `None` and null values.
87+
///
88+
/// Note: the returned array must contain [`Self::num_containers`] rows
89+
///
90+
/// [`UInt64Array`]: arrow::array::UInt64Array
91+
fn null_counts(&self, column: &Column) -> Option<ArrayRef>;
92+
93+
/// Return the number of rows for the named column in each container
94+
/// as an [`UInt64Array`].
95+
///
96+
/// See [`Self::min_values`] for when to return `None` and null values.
97+
///
98+
/// Note: the returned array must contain [`Self::num_containers`] rows
99+
///
100+
/// [`UInt64Array`]: arrow::array::UInt64Array
101+
fn row_counts(&self, column: &Column) -> Option<ArrayRef>;
102+
103+
/// Returns [`BooleanArray`] where each row represents information known
104+
/// about specific literal `values` in a column.
105+
///
106+
/// For example, Parquet Bloom Filters implement this API to communicate
107+
/// that `values` are known not to be present in a Row Group.
108+
///
109+
/// The returned array has one row for each container, with the following
110+
/// meanings:
111+
/// * `true` if the values in `column` ONLY contain values from `values`
112+
/// * `false` if the values in `column` are NOT ANY of `values`
113+
/// * `null` if the neither of the above holds or is unknown.
114+
///
115+
/// If these statistics can not determine column membership for any
116+
/// container, return `None` (the default).
117+
///
118+
/// Note: the returned array must contain [`Self::num_containers`] rows
119+
fn contained(
120+
&self,
121+
column: &Column,
122+
values: &HashSet<ScalarValue>,
123+
) -> Option<BooleanArray>;
124+
}

datafusion/datasource-parquet/src/page_filter.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,10 @@ use arrow::{
2828
array::ArrayRef,
2929
datatypes::{Schema, SchemaRef},
3030
};
31+
use datafusion_common::pruning::PruningStatistics;
3132
use datafusion_common::ScalarValue;
3233
use datafusion_physical_expr::{split_conjunction, PhysicalExpr};
33-
use datafusion_physical_optimizer::pruning::{PruningPredicate, PruningStatistics};
34+
use datafusion_physical_optimizer::pruning::PruningPredicate;
3435

3536
use log::{debug, trace};
3637
use parquet::arrow::arrow_reader::statistics::StatisticsConverter;

datafusion/datasource-parquet/src/row_group_filter.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,10 @@ use std::sync::Arc;
2121
use super::{ParquetAccessPlan, ParquetFileMetrics};
2222
use arrow::array::{ArrayRef, BooleanArray};
2323
use arrow::datatypes::Schema;
24+
use datafusion_common::pruning::PruningStatistics;
2425
use datafusion_common::{Column, Result, ScalarValue};
2526
use datafusion_datasource::FileRange;
26-
use datafusion_physical_optimizer::pruning::{PruningPredicate, PruningStatistics};
27+
use datafusion_physical_optimizer::pruning::PruningPredicate;
2728
use parquet::arrow::arrow_reader::statistics::StatisticsConverter;
2829
use parquet::arrow::parquet_column;
2930
use parquet::basic::Type;

datafusion/physical-optimizer/src/pruning.rs

Lines changed: 1 addition & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ use arrow::{
2828
datatypes::{DataType, Field, Schema, SchemaRef},
2929
record_batch::{RecordBatch, RecordBatchOptions},
3030
};
31+
use datafusion_common::pruning::PruningStatistics;
3132
use log::{debug, trace};
3233

3334
use datafusion_common::error::{DataFusionError, Result};
@@ -44,106 +45,6 @@ use datafusion_physical_expr::{expressions as phys_expr, PhysicalExprRef};
4445
use datafusion_physical_expr_common::physical_expr::snapshot_physical_expr;
4546
use datafusion_physical_plan::{ColumnarValue, PhysicalExpr};
4647

47-
/// A source of runtime statistical information to [`PruningPredicate`]s.
48-
///
49-
/// # Supported Information
50-
///
51-
/// 1. Minimum and maximum values for columns
52-
///
53-
/// 2. Null counts and row counts for columns
54-
///
55-
/// 3. Whether the values in a column are contained in a set of literals
56-
///
57-
/// # Vectorized Interface
58-
///
59-
/// Information for containers / files are returned as Arrow [`ArrayRef`], so
60-
/// the evaluation happens once on a single `RecordBatch`, which amortizes the
61-
/// overhead of evaluating the predicate. This is important when pruning 1000s
62-
/// of containers which often happens in analytic systems that have 1000s of
63-
/// potential files to consider.
64-
///
65-
/// For example, for the following three files with a single column `a`:
66-
/// ```text
67-
/// file1: column a: min=5, max=10
68-
/// file2: column a: No stats
69-
/// file2: column a: min=20, max=30
70-
/// ```
71-
///
72-
/// PruningStatistics would return:
73-
///
74-
/// ```text
75-
/// min_values("a") -> Some([5, Null, 20])
76-
/// max_values("a") -> Some([10, Null, 30])
77-
/// min_values("X") -> None
78-
/// ```
79-
pub trait PruningStatistics {
80-
/// Return the minimum values for the named column, if known.
81-
///
82-
/// If the minimum value for a particular container is not known, the
83-
/// returned array should have `null` in that row. If the minimum value is
84-
/// not known for any row, return `None`.
85-
///
86-
/// Note: the returned array must contain [`Self::num_containers`] rows
87-
fn min_values(&self, column: &Column) -> Option<ArrayRef>;
88-
89-
/// Return the maximum values for the named column, if known.
90-
///
91-
/// See [`Self::min_values`] for when to return `None` and null values.
92-
///
93-
/// Note: the returned array must contain [`Self::num_containers`] rows
94-
fn max_values(&self, column: &Column) -> Option<ArrayRef>;
95-
96-
/// Return the number of containers (e.g. Row Groups) being pruned with
97-
/// these statistics.
98-
///
99-
/// This value corresponds to the size of the [`ArrayRef`] returned by
100-
/// [`Self::min_values`], [`Self::max_values`], [`Self::null_counts`],
101-
/// and [`Self::row_counts`].
102-
fn num_containers(&self) -> usize;
103-
104-
/// Return the number of null values for the named column as an
105-
/// [`UInt64Array`]
106-
///
107-
/// See [`Self::min_values`] for when to return `None` and null values.
108-
///
109-
/// Note: the returned array must contain [`Self::num_containers`] rows
110-
///
111-
/// [`UInt64Array`]: arrow::array::UInt64Array
112-
fn null_counts(&self, column: &Column) -> Option<ArrayRef>;
113-
114-
/// Return the number of rows for the named column in each container
115-
/// as an [`UInt64Array`].
116-
///
117-
/// See [`Self::min_values`] for when to return `None` and null values.
118-
///
119-
/// Note: the returned array must contain [`Self::num_containers`] rows
120-
///
121-
/// [`UInt64Array`]: arrow::array::UInt64Array
122-
fn row_counts(&self, column: &Column) -> Option<ArrayRef>;
123-
124-
/// Returns [`BooleanArray`] where each row represents information known
125-
/// about specific literal `values` in a column.
126-
///
127-
/// For example, Parquet Bloom Filters implement this API to communicate
128-
/// that `values` are known not to be present in a Row Group.
129-
///
130-
/// The returned array has one row for each container, with the following
131-
/// meanings:
132-
/// * `true` if the values in `column` ONLY contain values from `values`
133-
/// * `false` if the values in `column` are NOT ANY of `values`
134-
/// * `null` if the neither of the above holds or is unknown.
135-
///
136-
/// If these statistics can not determine column membership for any
137-
/// container, return `None` (the default).
138-
///
139-
/// Note: the returned array must contain [`Self::num_containers`] rows
140-
fn contained(
141-
&self,
142-
column: &Column,
143-
values: &HashSet<ScalarValue>,
144-
) -> Option<BooleanArray>;
145-
}
146-
14748
/// Used to prove that arbitrary predicates (boolean expression) can not
14849
/// possibly evaluate to `true` given information about a column provided by
14950
/// [`PruningStatistics`].

0 commit comments

Comments
 (0)