You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
feat(optimizer): Enable filter pushdown on window functions (apache#14026)
* feat(optimizer): Enable filter pushdown on window functions
Ensures selections can be pushed past window functions similarly
to what is already done with aggregations, when possible.
* fix: Add missing dependency
* minor(optimizer): Use 'datafusion-functions-window' as a dev dependency
* docs(optimizer): Add example to filter pushdown on LogicalPlan::Window
(cherry picked from commit ad5a04f)
/// verifies that filters on partition expressions are not pushed, as the single expression
1473
+
/// column is not available to the user, unlike with aggregations
1474
+
#[test]
1475
+
fnfilter_expression_keep_window() -> Result<()>{
1476
+
let table_scan = test_table_scan()?;
1477
+
1478
+
let window = Expr::WindowFunction(WindowFunction::new(
1479
+
WindowFunctionDefinition::WindowUDF(
1480
+
datafusion_functions_window::rank::rank_udwf(),
1481
+
),
1482
+
vec![],
1483
+
))
1484
+
.partition_by(vec![add(col("a"), col("b"))])// PARTITION BY a + b
1485
+
.order_by(vec![col("c").sort(true,true)])
1486
+
.build()
1487
+
.unwrap();
1488
+
1489
+
let plan = LogicalPlanBuilder::from(table_scan)
1490
+
.window(vec![window])?
1491
+
// unlike with aggregations, single partition column "test.a + test.b" is not available
1492
+
// to the plan, so we use multiple columns when filtering
1493
+
.filter(add(col("a"),col("b")).gt(lit(10i64)))?
1494
+
.build()?;
1495
+
1496
+
let expected = "\
1497
+
Filter: test.a + test.b > Int64(10)\
1498
+
\n WindowAggr: windowExpr=[[rank() PARTITION BY [test.a + test.b] ORDER BY [test.c ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]\
1499
+
\n TableScan: test";
1500
+
assert_optimized_plan_eq(plan, expected)
1501
+
}
1502
+
1503
+
/// verifies that filters are not pushed on order by columns (that are not used in partitioning)
1504
+
#[test]
1505
+
fnfilter_order_keep_window() -> Result<()>{
1506
+
let table_scan = test_table_scan()?;
1507
+
1508
+
let window = Expr::WindowFunction(WindowFunction::new(
1509
+
WindowFunctionDefinition::WindowUDF(
1510
+
datafusion_functions_window::rank::rank_udwf(),
1511
+
),
1512
+
vec![],
1513
+
))
1514
+
.partition_by(vec![col("a")])
1515
+
.order_by(vec![col("c").sort(true,true)])
1516
+
.build()
1517
+
.unwrap();
1518
+
1519
+
let plan = LogicalPlanBuilder::from(table_scan)
1520
+
.window(vec![window])?
1521
+
.filter(col("c").gt(lit(10i64)))?
1522
+
.build()?;
1523
+
1524
+
let expected = "\
1525
+
Filter: test.c > Int64(10)\
1526
+
\n WindowAggr: windowExpr=[[rank() PARTITION BY [test.a] ORDER BY [test.c ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]\
1527
+
\n TableScan: test";
1528
+
assert_optimized_plan_eq(plan, expected)
1529
+
}
1530
+
1531
+
/// verifies that when we use multiple window functions with a common partition key, the filter
let window1 = Expr::WindowFunction(WindowFunction::new(
1538
+
WindowFunctionDefinition::WindowUDF(
1539
+
datafusion_functions_window::rank::rank_udwf(),
1540
+
),
1541
+
vec![],
1542
+
))
1543
+
.partition_by(vec![col("a")])
1544
+
.order_by(vec![col("c").sort(true,true)])
1545
+
.build()
1546
+
.unwrap();
1547
+
1548
+
let window2 = Expr::WindowFunction(WindowFunction::new(
1549
+
WindowFunctionDefinition::WindowUDF(
1550
+
datafusion_functions_window::rank::rank_udwf(),
1551
+
),
1552
+
vec![],
1553
+
))
1554
+
.partition_by(vec![col("b"), col("a")])
1555
+
.order_by(vec![col("c").sort(true,true)])
1556
+
.build()
1557
+
.unwrap();
1558
+
1559
+
let plan = LogicalPlanBuilder::from(table_scan)
1560
+
.window(vec![window1, window2])?
1561
+
.filter(col("a").gt(lit(10i64)))? // a appears in both window functions
1562
+
.build()?;
1563
+
1564
+
let expected = "\
1565
+
WindowAggr: windowExpr=[[rank() PARTITION BY [test.a] ORDER BY [test.c ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, rank() PARTITION BY [test.b, test.a] ORDER BY [test.c ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]\
let window1 = Expr::WindowFunction(WindowFunction::new(
1577
+
WindowFunctionDefinition::WindowUDF(
1578
+
datafusion_functions_window::rank::rank_udwf(),
1579
+
),
1580
+
vec![],
1581
+
))
1582
+
.partition_by(vec![col("a")])
1583
+
.order_by(vec![col("c").sort(true,true)])
1584
+
.build()
1585
+
.unwrap();
1586
+
1587
+
let window2 = Expr::WindowFunction(WindowFunction::new(
1588
+
WindowFunctionDefinition::WindowUDF(
1589
+
datafusion_functions_window::rank::rank_udwf(),
1590
+
),
1591
+
vec![],
1592
+
))
1593
+
.partition_by(vec![col("b"), col("a")])
1594
+
.order_by(vec![col("c").sort(true,true)])
1595
+
.build()
1596
+
.unwrap();
1597
+
1598
+
let plan = LogicalPlanBuilder::from(table_scan)
1599
+
.window(vec![window1, window2])?
1600
+
.filter(col("b").gt(lit(10i64)))? // b only appears in one window function
1601
+
.build()?;
1602
+
1603
+
let expected = "\
1604
+
Filter: test.b > Int64(10)\
1605
+
\n WindowAggr: windowExpr=[[rank() PARTITION BY [test.a] ORDER BY [test.c ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, rank() PARTITION BY [test.b, test.a] ORDER BY [test.c ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]\
1606
+
\n TableScan: test";
1607
+
assert_optimized_plan_eq(plan, expected)
1608
+
}
1609
+
1308
1610
/// verifies that a filter is pushed to before a projection, the filter expression is correctly re-written
0 commit comments