30 changes: 15 additions & 15 deletions datafusion/core/tests/dataframe/mod.rs
@@ -667,12 +667,12 @@ async fn test_aggregate_with_pk2() -> Result<()> {
     let df = df.filter(predicate)?;
     assert_snapshot!(
         physical_plan_to_string(&df).await,
-        @r###"
-    CoalesceBatchesExec: target_batch_size=8192
-      FilterExec: id@0 = 1 AND name@1 = a
-        AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[]
+        @r"
+    AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[], ordering_mode=Sorted
+      CoalesceBatchesExec: target_batch_size=8192
+        FilterExec: id@0 = 1 AND name@1 = a

[Contributor comment] Nice, looks like a small improvement (although removing the aggregate would also be possible in this case 🤔)

           DataSourceExec: partitions=1, partition_sizes=[1]
-    "###
+    "
     );

     // Since id and name are functionally dependent, we can use name among expression
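
To unpack the reviewer's aside: when the group-by keys form a unique key and there are no aggregate expressions, every group holds exactly one row, so the AggregateExec is a no-op that could in principle be elided entirely. A minimal sketch of that situation (illustrative only, not part of this PR; assumes the datafusion and tokio crates):

use datafusion::prelude::*;

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    let ctx = SessionContext::new();
    // (column1, column2) is unique in this tiny table.
    ctx.sql("CREATE TABLE t AS VALUES (1, 'a'), (2, 'b')").await?;
    // Grouping by all key columns with no aggregate expressions returns one
    // row per group, i.e. the distinct input rows themselves.
    let df = ctx
        .sql("SELECT column1, column2 FROM t GROUP BY column1, column2")
        .await?;
    println!("{}", df.logical_plan().display_indent());
    Ok(())
}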
@@ -716,12 +716,12 @@ async fn test_aggregate_with_pk3() -> Result<()> {
     let df = df.select(vec![col("id"), col("name")])?;
     assert_snapshot!(
         physical_plan_to_string(&df).await,
-        @r###"
-    CoalesceBatchesExec: target_batch_size=8192
-      FilterExec: id@0 = 1
-        AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[]
+        @r"
+    AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[], ordering_mode=PartiallySorted([0])
+      CoalesceBatchesExec: target_batch_size=8192
+        FilterExec: id@0 = 1
           DataSourceExec: partitions=1, partition_sizes=[1]
-    "###
+    "
     );

     // Since id and name are functionally dependent, we can use name among expression
@@ -767,12 +767,12 @@ async fn test_aggregate_with_pk4() -> Result<()> {
     // columns are not used.
     assert_snapshot!(
         physical_plan_to_string(&df).await,
-        @r###"
-    CoalesceBatchesExec: target_batch_size=8192
-      FilterExec: id@0 = 1
-        AggregateExec: mode=Single, gby=[id@0 as id], aggr=[]
+        @r"
+    AggregateExec: mode=Single, gby=[id@0 as id], aggr=[], ordering_mode=Sorted
+      CoalesceBatchesExec: target_batch_size=8192
+        FilterExec: id@0 = 1
           DataSourceExec: partitions=1, partition_sizes=[1]
-    "###
+    "
     );

     let df_results = df.collect().await?;
71 changes: 68 additions & 3 deletions datafusion/optimizer/src/push_down_filter.rs
@@ -978,8 +978,11 @@ impl OptimizerRule for PushDownFilter {
                 let group_expr_columns = agg
                     .group_expr
                     .iter()
-                    .map(|e| Ok(Column::from_qualified_name(e.schema_name().to_string())))
-                    .collect::<Result<HashSet<_>>>()?;
+                    .map(|e| {
+                        let (relation, name) = e.qualified_name();
+                        Column::new(relation, name)
+                    })
+                    .collect::<HashSet<_>>();

                 let predicates = split_conjunction_owned(filter.predicate);

@@ -1047,7 +1050,10 @@
                     func.params
                         .partition_by
                         .iter()
-                        .map(|c| Column::from_qualified_name(c.schema_name().to_string()))
+                        .map(|c| {
+                            let (relation, name) = c.qualified_name();
+                            Column::new(relation, name)
+                        })
                         .collect::<HashSet<_>>()
                 };
                 let potential_partition_keys = window
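
Both hunks above replace the same pattern: instead of rendering the expression to a flat string and re-parsing it with Column::from_qualified_name, the fix reads the (relation, name) pair directly via qualified_name() and builds the Column with Column::new. A minimal sketch of why the string round-trip is fragile (illustrative, assumes the datafusion-common crate; the parser's exact fallback behavior is version-dependent):

use datafusion_common::Column;

fn main() {
    // Direct construction: the unusual name "$a" is stored verbatim.
    let direct = Column::new(Some("test"), "$a");

    // String round-trip: "test.$a" must be re-parsed into relation + name.
    // "$a" is not a valid SQL identifier, so the parse can fail to split the
    // string, yielding a column that no longer matches `direct`, which is
    // what blocked filter pushdown for such names.
    let parsed = Column::from_qualified_name(direct.flat_name());

    println!("direct: {direct:?}");
    println!("parsed: {parsed:?}");
    println!("equal:  {}", direct == parsed);
}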
@@ -1567,6 +1573,30 @@ mod tests {
         )
     }

+    /// verifies that filters with unusual column names are pushed down through aggregate operators
+    #[test]
+    fn filter_move_agg_special() -> Result<()> {
+        let schema = Schema::new(vec![
+            Field::new("$a", DataType::UInt32, false),
+            Field::new("$b", DataType::UInt32, false),
+            Field::new("$c", DataType::UInt32, false),
+        ]);
+        let table_scan = table_scan(Some("test"), &schema, None)?.build()?;
+
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .aggregate(vec![col("$a")], vec![sum(col("$b")).alias("total_salary")])?
+            .filter(col("$a").gt(lit(10i64)))?
+            .build()?;
+        // filter of key aggregation is commutative
+        assert_optimized_plan_equal!(
+            plan,
+            @r"
+        Aggregate: groupBy=[[test.$a]], aggr=[[sum(test.$b) AS total_salary]]
+          TableScan: test, full_filters=[test.$a > Int64(10)]
+        "
+        )
+    }
+
     #[test]
     fn filter_complex_group_by() -> Result<()> {
         let table_scan = test_table_scan()?;
@@ -1647,6 +1677,41 @@
         )
     }

+    /// verifies that filters with unusual identifier names are pushed down through window functions
+    #[test]
+    fn filter_window_special_identifier() -> Result<()> {
+        let schema = Schema::new(vec![
+            Field::new("$a", DataType::UInt32, false),
+            Field::new("$b", DataType::UInt32, false),
+            Field::new("$c", DataType::UInt32, false),
+        ]);
+        let table_scan = table_scan(Some("test"), &schema, None)?.build()?;
+
+        let window = Expr::from(WindowFunction::new(
+            WindowFunctionDefinition::WindowUDF(
+                datafusion_functions_window::rank::rank_udwf(),
+            ),
+            vec![],
+        ))
+        .partition_by(vec![col("$a"), col("$b")])
+        .order_by(vec![col("$c").sort(true, true)])
+        .build()
+        .unwrap();
+
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .window(vec![window])?
+            .filter(col("$b").gt(lit(10i64)))?
+            .build()?;
+
+        assert_optimized_plan_equal!(
+            plan,
+            @r"
+        WindowAggr: windowExpr=[[rank() PARTITION BY [test.$a, test.$b] ORDER BY [test.$c ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+          TableScan: test, full_filters=[test.$b > Int64(10)]

[Contributor comment] nice to see this pushed down

+        "
+        )
+    }
+
     /// verifies that when partitioning by 'a' and 'b', and filtering by 'a' and 'b', both 'a' and
     /// 'b' are pushed
     #[test]