From 0f8e9a8cf7c327468645818a0fba3e736b9e512a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 7 Sep 2024 09:49:21 +0200 Subject: [PATCH 01/22] Filter null keys by default --- datafusion/common/src/config.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 19978e102cc8..1e1b601c190e 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -514,7 +514,7 @@ config_namespace! { /// a nullable and non-nullable column to filter out nulls on the nullable side. This /// filter can add additional overhead when the file format does not fully support /// predicate push down. - pub filter_null_join_keys: bool, default = false + pub filter_null_join_keys: bool, default = true /// Should DataFusion repartition data using the aggregate keys to execute aggregates /// in parallel using the provided `target_partitions` level From 16247408ccfafb910af94fe37417eaf8f43efbd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 7 Sep 2024 10:10:43 +0200 Subject: [PATCH 02/22] null_equals_null --- datafusion/optimizer/src/filter_null_join_keys.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/filter_null_join_keys.rs b/datafusion/optimizer/src/filter_null_join_keys.rs index 01e3d27c580f..c5beb3709281 100644 --- a/datafusion/optimizer/src/filter_null_join_keys.rs +++ b/datafusion/optimizer/src/filter_null_join_keys.rs @@ -50,7 +50,7 @@ impl OptimizerRule for FilterNullJoinKeys { return Ok(Transformed::no(plan)); } match plan { - LogicalPlan::Join(mut join) if !join.on.is_empty() => { + LogicalPlan::Join(mut join) if !join.on.is_empty() && !join.null_equals_null() => { let (left_preserved, right_preserved) = on_lr_is_preserved(join.join_type); From 3de10179788ee8da04fb6899f490a72f2e341c34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 7 Sep 2024 10:11:34 +0200 
Subject: [PATCH 03/22] Docs --- docs/source/user-guide/cli/usage.md | 4 ++-- docs/source/user-guide/configs.md | 2 +- docs/source/user-guide/sql/information_schema.md | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/user-guide/cli/usage.md b/docs/source/user-guide/cli/usage.md index 6a620fc69252..ecc8ade83327 100644 --- a/docs/source/user-guide/cli/usage.md +++ b/docs/source/user-guide/cli/usage.md @@ -138,7 +138,7 @@ Show configuration options | datafusion.execution.time_zone | UTC | | datafusion.explain.logical_plan_only | false | | datafusion.explain.physical_plan_only | false | -| datafusion.optimizer.filter_null_join_keys | false | +| datafusion.optimizer.filter_null_join_keys | true | | datafusion.optimizer.skip_failed_rules | true | +-------------------------------------------------+---------+ @@ -191,7 +191,7 @@ DataFusion CLI v12.0.0 | datafusion.execution.time_zone | UTC | | datafusion.explain.logical_plan_only | false | | datafusion.explain.physical_plan_only | false | -| datafusion.optimizer.filter_null_join_keys | false | +| datafusion.optimizer.filter_null_join_keys | true | | datafusion.optimizer.skip_failed_rules | true | +-------------------------------------------------+---------+ 8 rows in set. Query took 0.002 seconds. diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 8514fb1fbd93..975cbd42c93b 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -94,7 +94,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.optimizer.enable_distinct_aggregation_soft_limit | true | When set to true, the optimizer will push a limit operation into grouped aggregations which have no aggregate expressions, as a soft limit, emitting groups once the limit is reached, before all rows in the group are read. 
| | datafusion.optimizer.enable_round_robin_repartition | true | When set to true, the physical plan optimizer will try to add round robin repartitioning to increase parallelism to leverage more CPU cores | | datafusion.optimizer.enable_topk_aggregation | true | When set to true, the optimizer will attempt to perform limit operations during aggregations, if possible | -| datafusion.optimizer.filter_null_join_keys | false | When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. This filter can add additional overhead when the file format does not fully support predicate push down. | +| datafusion.optimizer.filter_null_join_keys | true | When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. This filter can add additional overhead when the file format does not fully support predicate push down. | | datafusion.optimizer.repartition_aggregations | true | Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided `target_partitions` level | | datafusion.optimizer.repartition_file_min_size | 10485760 | Minimum total files size in bytes to perform file scan repartitioning. 
| | datafusion.optimizer.repartition_joins | true | Should DataFusion repartition data using the join keys to execute joins in parallel using the provided `target_partitions` level | diff --git a/docs/source/user-guide/sql/information_schema.md b/docs/source/user-guide/sql/information_schema.md index bf4aa00e1dde..bd70ffce6ce3 100644 --- a/docs/source/user-guide/sql/information_schema.md +++ b/docs/source/user-guide/sql/information_schema.md @@ -65,7 +65,7 @@ select * from information_schema.df_settings; | datafusion.execution.time_zone | UTC | | datafusion.explain.logical_plan_only | false | | datafusion.explain.physical_plan_only | false | -| datafusion.optimizer.filter_null_join_keys | false | +| datafusion.optimizer.filter_null_join_keys | true | | datafusion.optimizer.skip_failed_rules | true | +-------------------------------------------------+---------+ ``` From 03784e25bf99b880dbb6db93a7381c89a23ad980 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 7 Sep 2024 10:24:11 +0200 Subject: [PATCH 04/22] Update filter_null_join_keys.rs --- datafusion/optimizer/src/filter_null_join_keys.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/filter_null_join_keys.rs b/datafusion/optimizer/src/filter_null_join_keys.rs index c5beb3709281..66e24f5dee7c 100644 --- a/datafusion/optimizer/src/filter_null_join_keys.rs +++ b/datafusion/optimizer/src/filter_null_join_keys.rs @@ -50,7 +50,7 @@ impl OptimizerRule for FilterNullJoinKeys { return Ok(Transformed::no(plan)); } match plan { - LogicalPlan::Join(mut join) if !join.on.is_empty() && !join.null_equals_null() => { + LogicalPlan::Join(mut join) if !join.on.is_empty() && !join.null_equals_null => { let (left_preserved, right_preserved) = on_lr_is_preserved(join.join_type); From b45a74ffe9b8556e64d72a3fa16ad66acdd7edee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 7 Sep 2024 15:40:22 +0200 Subject: [PATCH 05/22] Docs --- 
datafusion/sqllogictest/test_files/information_schema.slt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index f797a7a6539d..a81f196d0a7e 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -225,7 +225,7 @@ datafusion.optimizer.default_filter_selectivity 20 datafusion.optimizer.enable_distinct_aggregation_soft_limit true datafusion.optimizer.enable_round_robin_repartition true datafusion.optimizer.enable_topk_aggregation true -datafusion.optimizer.filter_null_join_keys false +datafusion.optimizer.filter_null_join_keys true datafusion.optimizer.hash_join_single_partition_threshold 1048576 datafusion.optimizer.hash_join_single_partition_threshold_rows 131072 datafusion.optimizer.max_passes 3 @@ -314,7 +314,7 @@ datafusion.optimizer.default_filter_selectivity 20 The default filter selectivit datafusion.optimizer.enable_distinct_aggregation_soft_limit true When set to true, the optimizer will push a limit operation into grouped aggregations which have no aggregate expressions, as a soft limit, emitting groups once the limit is reached, before all rows in the group are read. datafusion.optimizer.enable_round_robin_repartition true When set to true, the physical plan optimizer will try to add round robin repartitioning to increase parallelism to leverage more CPU cores datafusion.optimizer.enable_topk_aggregation true When set to true, the optimizer will attempt to perform limit operations during aggregations, if possible -datafusion.optimizer.filter_null_join_keys false When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. This filter can add additional overhead when the file format does not fully support predicate push down. 
+datafusion.optimizer.filter_null_join_keys true When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. This filter can add additional overhead when the file format does not fully support predicate push down. datafusion.optimizer.hash_join_single_partition_threshold 1048576 The maximum estimated size in bytes for one input side of a HashJoin will be collected into a single partition datafusion.optimizer.hash_join_single_partition_threshold_rows 131072 The maximum estimated size in rows for one input side of a HashJoin will be collected into a single partition datafusion.optimizer.max_passes 3 Number of times that the optimizer will attempt to optimize the plan From df9e3db4ea00f6c4b068e159173f5ade342c20c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 7 Sep 2024 15:45:53 +0200 Subject: [PATCH 06/22] Wip --- .../optimizer/src/filter_null_join_keys.rs | 4 ++- .../sqllogictest/test_files/group_by.slt | 36 ++++++++++--------- datafusion/sqllogictest/test_files/join.slt | 18 ++++++---- 3 files changed, 34 insertions(+), 24 deletions(-) diff --git a/datafusion/optimizer/src/filter_null_join_keys.rs b/datafusion/optimizer/src/filter_null_join_keys.rs index 66e24f5dee7c..c48012e211ec 100644 --- a/datafusion/optimizer/src/filter_null_join_keys.rs +++ b/datafusion/optimizer/src/filter_null_join_keys.rs @@ -50,7 +50,9 @@ impl OptimizerRule for FilterNullJoinKeys { return Ok(Transformed::no(plan)); } match plan { - LogicalPlan::Join(mut join) if !join.on.is_empty() && !join.null_equals_null => { + LogicalPlan::Join(mut join) + if !join.on.is_empty() && !join.null_equals_null => + { let (left_preserved, right_preserved) = on_lr_is_preserved(join.join_type); diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index 73bfd9844609..9eb7129027a7 100644 --- 
a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -2009,23 +2009,27 @@ logical_plan 03)----Aggregate: groupBy=[[l.col0, l.col1, l.col2]], aggr=[[last_value(r.col1) ORDER BY [r.col0 ASC NULLS LAST]]] 04)------Inner Join: l.col0 = r.col0 05)--------SubqueryAlias: l -06)----------TableScan: tab0 projection=[col0, col1, col2] -07)--------SubqueryAlias: r -08)----------TableScan: tab0 projection=[col0, col1] +06)----------Filter: tab0.col0 IS NOT NULL +07)------------TableScan: tab0 projection=[col0, col1, col2] +08)--------SubqueryAlias: r +09)----------Filter: tab0.col0 IS NOT NULL +10)------------TableScan: tab0 projection=[col0, col1] physical_plan -01)SortPreservingMergeExec: [col0@0 ASC NULLS LAST] -02)--SortExec: expr=[col0@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----ProjectionExec: expr=[col0@0 as col0, last_value(r.col1) ORDER BY [r.col0 ASC NULLS LAST]@3 as last_col1] -04)------AggregateExec: mode=FinalPartitioned, gby=[col0@0 as col0, col1@1 as col1, col2@2 as col2], aggr=[last_value(r.col1) ORDER BY [r.col0 ASC NULLS LAST]] -05)--------CoalesceBatchesExec: target_batch_size=8192 -06)----------RepartitionExec: partitioning=Hash([col0@0, col1@1, col2@2], 4), input_partitions=4 -07)------------AggregateExec: mode=Partial, gby=[col0@0 as col0, col1@1 as col1, col2@2 as col2], aggr=[last_value(r.col1) ORDER BY [r.col0 ASC NULLS LAST]] -08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -09)----------------ProjectionExec: expr=[col0@2 as col0, col1@3 as col1, col2@4 as col2, col0@0 as col0, col1@1 as col1] -10)------------------CoalesceBatchesExec: target_batch_size=8192 -11)--------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(col0@0, col0@0)] -12)----------------------MemoryExec: partitions=1, partition_sizes=[3] -13)----------------------MemoryExec: partitions=1, partition_sizes=[3] +08)--------------ProjectionExec: expr=[col0@2 as col0, 
col1@3 as col1, col2@4 as col2, col0@0 as col0, col1@1 as col1] +09)----------------CoalesceBatchesExec: target_batch_size=8192 +10)------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(col0@0, col0@0)] +11)--------------------CoalesceBatchesExec: target_batch_size=8192 +12)----------------------RepartitionExec: partitioning=Hash([col0@0], 4), input_partitions=4 +13)------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +14)--------------------------CoalesceBatchesExec: target_batch_size=8192 +15)----------------------------FilterExec: col0@0 IS NOT NULL +16)------------------------------MemoryExec: partitions=1, partition_sizes=[3] +17)--------------------CoalesceBatchesExec: target_batch_size=8192 +18)----------------------RepartitionExec: partitioning=Hash([col0@0], 4), input_partitions=4 +19)------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +20)--------------------------CoalesceBatchesExec: target_batch_size=8192 +21)----------------------------FilterExec: col0@0 IS NOT NULL +22)------------------------------MemoryExec: partitions=1, partition_sizes=[3] # Columns in the table are a,b,c,d. Source is CsvExec which is ordered by # a,b,c column. Column a has cardinality 2, column b has cardinality 4. 
diff --git a/datafusion/sqllogictest/test_files/join.slt b/datafusion/sqllogictest/test_files/join.slt index 21fea4ad1025..f5865f99acfe 100644 --- a/datafusion/sqllogictest/test_files/join.slt +++ b/datafusion/sqllogictest/test_files/join.slt @@ -750,14 +750,18 @@ WHERE t1.a=t2.a; ---- logical_plan 01)Inner Join: t1.a = t2.a -02)--TableScan: t1 projection=[a, b] -03)--SubqueryAlias: t2 -04)----TableScan: t1 projection=[a, b] +02)--Filter: t1.a IS NOT NULL +03)----TableScan: t1 projection=[a, b] +04)--SubqueryAlias: t2 +05)----Filter: t1.a IS NOT NULL +06)------TableScan: t1 projection=[a, b] physical_plan -01)CoalesceBatchesExec: target_batch_size=8192 -02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0)] -03)----MemoryExec: partitions=1, partition_sizes=[1] -04)----MemoryExec: partitions=1, partition_sizes=[1] +03)----CoalesceBatchesExec: target_batch_size=8192 +04)------FilterExec: a@0 IS NOT NULL +05)--------MemoryExec: partitions=1, partition_sizes=[1] +06)----CoalesceBatchesExec: target_batch_size=8192 +07)------FilterExec: a@0 IS NOT NULL +08)--------MemoryExec: partitions=1, partition_sizes=[1] # Reset the configs to old values statement ok From dc876a9c85438902b65e9c4f7da6402f1c432e07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 7 Sep 2024 16:01:31 +0200 Subject: [PATCH 07/22] Wip --- datafusion/sqllogictest/test_files/joins.slt | 4 +- .../sqllogictest/test_files/predicates.slt | 12 ++-- .../sqllogictest/test_files/subquery.slt | 13 +++-- .../sqllogictest/test_files/tpch/q10.slt.part | 56 ++++++++++--------- 4 files changed, 46 insertions(+), 39 deletions(-) diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index 7d0262952b31..d4dd6359f259 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -1052,9 +1052,9 @@ WHERE join_t2.t2_id < 100 ---- logical_plan 01)Inner Join: join_t1.t1_id = join_t2.t2_id 
-02)--Filter: join_t1.t1_id < UInt32(100) +02)--Filter: join_t1.t1_id IS NOT NULL AND join_t1.t1_id < UInt32(100) 03)----TableScan: join_t1 projection=[t1_id, t1_name, t1_int] -04)--Filter: join_t2.t2_id < UInt32(100) +04)--Filter: join_t2.t2_id IS NOT NULL AND join_t2.t2_id < UInt32(100) 05)----TableScan: join_t2 projection=[t2_id, t2_name, t2_int] # Reduce left join 2 (to inner join) diff --git a/datafusion/sqllogictest/test_files/predicates.slt b/datafusion/sqllogictest/test_files/predicates.slt index 878d7c8a4dfb..646cd7e00f21 100644 --- a/datafusion/sqllogictest/test_files/predicates.slt +++ b/datafusion/sqllogictest/test_files/predicates.slt @@ -663,23 +663,23 @@ OR logical_plan 01)Projection: lineitem.l_partkey 02)--Inner Join: lineitem.l_partkey = part.p_partkey Filter: part.p_brand = Utf8("Brand#12") AND lineitem.l_quantity >= Decimal128(Some(100),15,2) AND lineitem.l_quantity <= Decimal128(Some(1100),15,2) AND part.p_size <= Int32(5) OR part.p_brand = Utf8("Brand#23") AND lineitem.l_quantity >= Decimal128(Some(1000),15,2) AND lineitem.l_quantity <= Decimal128(Some(2000),15,2) AND part.p_size <= Int32(10) OR part.p_brand = Utf8("Brand#34") AND lineitem.l_quantity >= Decimal128(Some(2000),15,2) AND lineitem.l_quantity <= Decimal128(Some(3000),15,2) AND part.p_size <= Int32(15) -03)----Filter: lineitem.l_quantity >= Decimal128(Some(100),15,2) AND lineitem.l_quantity <= Decimal128(Some(1100),15,2) OR lineitem.l_quantity >= Decimal128(Some(1000),15,2) AND lineitem.l_quantity <= Decimal128(Some(2000),15,2) OR lineitem.l_quantity >= Decimal128(Some(2000),15,2) AND lineitem.l_quantity <= Decimal128(Some(3000),15,2) -04)------TableScan: lineitem projection=[l_partkey, l_quantity], partial_filters=[lineitem.l_quantity >= Decimal128(Some(100),15,2) AND lineitem.l_quantity <= Decimal128(Some(1100),15,2) OR lineitem.l_quantity >= Decimal128(Some(1000),15,2) AND lineitem.l_quantity <= Decimal128(Some(2000),15,2) OR lineitem.l_quantity >= Decimal128(Some(2000),15,2) AND 
lineitem.l_quantity <= Decimal128(Some(3000),15,2)] -05)----Filter: (part.p_brand = Utf8("Brand#12") AND part.p_size <= Int32(5) OR part.p_brand = Utf8("Brand#23") AND part.p_size <= Int32(10) OR part.p_brand = Utf8("Brand#34") AND part.p_size <= Int32(15)) AND part.p_size >= Int32(1) -06)------TableScan: part projection=[p_partkey, p_brand, p_size], partial_filters=[part.p_size >= Int32(1), part.p_brand = Utf8("Brand#12") AND part.p_size <= Int32(5) OR part.p_brand = Utf8("Brand#23") AND part.p_size <= Int32(10) OR part.p_brand = Utf8("Brand#34") AND part.p_size <= Int32(15)] +03)----Filter: (lineitem.l_quantity >= Decimal128(Some(100),15,2) AND lineitem.l_quantity <= Decimal128(Some(1100),15,2) OR lineitem.l_quantity >= Decimal128(Some(1000),15,2) AND lineitem.l_quantity <= Decimal128(Some(2000),15,2) OR lineitem.l_quantity >= Decimal128(Some(2000),15,2) AND lineitem.l_quantity <= Decimal128(Some(3000),15,2)) AND lineitem.l_partkey IS NOT NULL +04)------TableScan: lineitem projection=[l_partkey, l_quantity], partial_filters=[lineitem.l_quantity >= Decimal128(Some(100),15,2) AND lineitem.l_quantity <= Decimal128(Some(1100),15,2) OR lineitem.l_quantity >= Decimal128(Some(1000),15,2) AND lineitem.l_quantity <= Decimal128(Some(2000),15,2) OR lineitem.l_quantity >= Decimal128(Some(2000),15,2) AND lineitem.l_quantity <= Decimal128(Some(3000),15,2), lineitem.l_partkey IS NOT NULL] +05)----Filter: (part.p_brand = Utf8("Brand#12") AND part.p_size <= Int32(5) OR part.p_brand = Utf8("Brand#23") AND part.p_size <= Int32(10) OR part.p_brand = Utf8("Brand#34") AND part.p_size <= Int32(15)) AND part.p_partkey IS NOT NULL AND part.p_size >= Int32(1) +06)------TableScan: part projection=[p_partkey, p_brand, p_size], partial_filters=[part.p_size >= Int32(1), part.p_brand = Utf8("Brand#12") AND part.p_size <= Int32(5) OR part.p_brand = Utf8("Brand#23") AND part.p_size <= Int32(10) OR part.p_brand = Utf8("Brand#34") AND part.p_size <= Int32(15), part.p_partkey IS NOT NULL] 
physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_partkey@0, p_partkey@0)], filter=p_brand@1 = Brand#12 AND l_quantity@0 >= Some(100),15,2 AND l_quantity@0 <= Some(1100),15,2 AND p_size@2 <= 5 OR p_brand@1 = Brand#23 AND l_quantity@0 >= Some(1000),15,2 AND l_quantity@0 <= Some(2000),15,2 AND p_size@2 <= 10 OR p_brand@1 = Brand#34 AND l_quantity@0 >= Some(2000),15,2 AND l_quantity@0 <= Some(3000),15,2 AND p_size@2 <= 15, projection=[l_partkey@0] 03)----CoalesceBatchesExec: target_batch_size=8192 04)------RepartitionExec: partitioning=Hash([l_partkey@0], 4), input_partitions=4 05)--------CoalesceBatchesExec: target_batch_size=8192 -06)----------FilterExec: l_quantity@1 >= Some(100),15,2 AND l_quantity@1 <= Some(1100),15,2 OR l_quantity@1 >= Some(1000),15,2 AND l_quantity@1 <= Some(2000),15,2 OR l_quantity@1 >= Some(2000),15,2 AND l_quantity@1 <= Some(3000),15,2 +06)----------FilterExec: (l_quantity@1 >= Some(100),15,2 AND l_quantity@1 <= Some(1100),15,2 OR l_quantity@1 >= Some(1000),15,2 AND l_quantity@1 <= Some(2000),15,2 OR l_quantity@1 >= Some(2000),15,2 AND l_quantity@1 <= Some(3000),15,2) AND l_partkey@0 IS NOT NULL 07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 08)--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/tpch-csv/lineitem.csv]]}, projection=[l_partkey, l_quantity], has_header=true 09)----CoalesceBatchesExec: target_batch_size=8192 10)------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4 11)--------CoalesceBatchesExec: target_batch_size=8192 -12)----------FilterExec: (p_brand@1 = Brand#12 AND p_size@2 <= 5 OR p_brand@1 = Brand#23 AND p_size@2 <= 10 OR p_brand@1 = Brand#34 AND p_size@2 <= 15) AND p_size@2 >= 1 +12)----------FilterExec: (p_brand@1 = Brand#12 AND p_size@2 <= 5 OR p_brand@1 = Brand#23 AND p_size@2 <= 10 OR p_brand@1 = Brand#34 AND p_size@2 <= 15) AND p_partkey@0 IS NOT NULL 
AND p_size@2 >= 1 13)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 14)--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/tpch-csv/part.csv]]}, projection=[p_partkey, p_brand, p_size], has_header=true diff --git a/datafusion/sqllogictest/test_files/subquery.slt b/datafusion/sqllogictest/test_files/subquery.slt index 30b3631681e7..4469bebeeb84 100644 --- a/datafusion/sqllogictest/test_files/subquery.slt +++ b/datafusion/sqllogictest/test_files/subquery.slt @@ -198,7 +198,8 @@ logical_plan 04)----SubqueryAlias: __scalar_sq_1 05)------Projection: sum(t2.t2_int), t2.t2_id 06)--------Aggregate: groupBy=[[t2.t2_id]], aggr=[[sum(CAST(t2.t2_int AS Int64))]] -07)----------TableScan: t2 projection=[t2_id, t2_int] +07)----------Filter: t2.t2_id IS NOT NULL +08)------------TableScan: t2 projection=[t2_id, t2_int] physical_plan 01)ProjectionExec: expr=[t1_id@1 as t1_id, sum(t2.t2_int)@0 as t2_sum] 02)--CoalesceBatchesExec: target_batch_size=2 @@ -208,10 +209,12 @@ physical_plan 06)----------CoalesceBatchesExec: target_batch_size=2 07)------------RepartitionExec: partitioning=Hash([t2_id@0], 4), input_partitions=4 08)--------------AggregateExec: mode=Partial, gby=[t2_id@0 as t2_id], aggr=[sum(t2.t2_int)] -09)----------------MemoryExec: partitions=4, partition_sizes=[1, 0, 0, 0] -10)------CoalesceBatchesExec: target_batch_size=2 -11)--------RepartitionExec: partitioning=Hash([t1_id@0], 4), input_partitions=4 -12)----------MemoryExec: partitions=4, partition_sizes=[1, 0, 0, 0] +09)----------------CoalesceBatchesExec: target_batch_size=2 +10)------------------FilterExec: t2_id@0 IS NOT NULL +11)--------------------MemoryExec: partitions=4, partition_sizes=[1, 0, 0, 0] +12)------CoalesceBatchesExec: target_batch_size=2 +13)--------RepartitionExec: partitioning=Hash([t1_id@0], 4), input_partitions=4 +14)----------MemoryExec: partitions=4, partition_sizes=[1, 0, 0, 0] query II rowsort SELECT t1_id, (SELECT sum(t2_int) 
FROM t2 WHERE t2.t2_id = t1.t1_id) as t2_sum from t1 diff --git a/datafusion/sqllogictest/test_files/tpch/q10.slt.part b/datafusion/sqllogictest/test_files/tpch/q10.slt.part index 73593a470c9a..d9779da8e629 100644 --- a/datafusion/sqllogictest/test_files/tpch/q10.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/q10.slt.part @@ -60,14 +60,16 @@ logical_plan 07)------------Inner Join: orders.o_orderkey = lineitem.l_orderkey 08)--------------Projection: customer.c_custkey, customer.c_name, customer.c_address, customer.c_nationkey, customer.c_phone, customer.c_acctbal, customer.c_comment, orders.o_orderkey 09)----------------Inner Join: customer.c_custkey = orders.o_custkey -10)------------------TableScan: customer projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_comment] -11)------------------Projection: orders.o_orderkey, orders.o_custkey -12)--------------------Filter: orders.o_orderdate >= Date32("1993-10-01") AND orders.o_orderdate < Date32("1994-01-01") -13)----------------------TableScan: orders projection=[o_orderkey, o_custkey, o_orderdate], partial_filters=[orders.o_orderdate >= Date32("1993-10-01"), orders.o_orderdate < Date32("1994-01-01")] -14)--------------Projection: lineitem.l_orderkey, lineitem.l_extendedprice, lineitem.l_discount -15)----------------Filter: lineitem.l_returnflag = Utf8("R") -16)------------------TableScan: lineitem projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], partial_filters=[lineitem.l_returnflag = Utf8("R")] -17)----------TableScan: nation projection=[n_nationkey, n_name] +10)------------------Filter: customer.c_nationkey IS NOT NULL AND customer.c_custkey IS NOT NULL +11)--------------------TableScan: customer projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_comment], partial_filters=[customer.c_nationkey IS NOT NULL, customer.c_custkey IS NOT NULL] +12)------------------Projection: orders.o_orderkey, orders.o_custkey 
+13)--------------------Filter: orders.o_orderkey IS NOT NULL AND orders.o_custkey IS NOT NULL AND orders.o_orderdate >= Date32("1993-10-01") AND orders.o_orderdate < Date32("1994-01-01") +14)----------------------TableScan: orders projection=[o_orderkey, o_custkey, o_orderdate], partial_filters=[orders.o_orderdate >= Date32("1993-10-01"), orders.o_orderdate < Date32("1994-01-01"), orders.o_orderkey IS NOT NULL, orders.o_custkey IS NOT NULL] +15)--------------Projection: lineitem.l_orderkey, lineitem.l_extendedprice, lineitem.l_discount +16)----------------Filter: lineitem.l_orderkey IS NOT NULL AND lineitem.l_returnflag = Utf8("R") +17)------------------TableScan: lineitem projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], partial_filters=[lineitem.l_returnflag = Utf8("R"), lineitem.l_orderkey IS NOT NULL] +18)----------Filter: nation.n_nationkey IS NOT NULL +19)------------TableScan: nation projection=[n_nationkey, n_name], partial_filters=[nation.n_nationkey IS NOT NULL] physical_plan 01)SortPreservingMergeExec: [revenue@2 DESC], fetch=10 02)--SortExec: TopK(fetch=10), expr=[revenue@2 DESC], preserve_partitioning=[true] @@ -88,24 +90,26 @@ physical_plan 17)--------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_custkey@0, c_name@1, c_address@2, c_nationkey@3, c_phone@4, c_acctbal@5, c_comment@6, o_orderkey@7] 18)----------------------------------CoalesceBatchesExec: target_batch_size=8192 19)------------------------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=4 -20)--------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -21)----------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_comment], has_header=false 
-22)----------------------------------CoalesceBatchesExec: target_batch_size=8192 -23)------------------------------------RepartitionExec: partitioning=Hash([o_custkey@1], 4), input_partitions=4 -24)--------------------------------------CoalesceBatchesExec: target_batch_size=8192 -25)----------------------------------------FilterExec: o_orderdate@2 >= 1993-10-01 AND o_orderdate@2 < 1994-01-01, projection=[o_orderkey@0, o_custkey@1] -26)------------------------------------------CsvExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_custkey, o_orderdate], has_header=false -27)--------------------------CoalesceBatchesExec: target_batch_size=8192 -28)----------------------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4 -29)------------------------------CoalesceBatchesExec: target_batch_size=8192 -30)--------------------------------FilterExec: l_returnflag@3 = R, projection=[l_orderkey@0, l_extendedprice@1, l_discount@2] -31)----------------------------------CsvExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], has_header=false -32)------------------CoalesceBatchesExec: target_batch_size=8192 -33)--------------------RepartitionExec: 
partitioning=Hash([n_nationkey@0], 4), input_partitions=4 -34)----------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -35)------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], has_header=false - - +20)--------------------------------------CoalesceBatchesExec: target_batch_size=8192 +21)----------------------------------------FilterExec: c_nationkey@3 IS NOT NULL AND c_custkey@0 IS NOT NULL +22)------------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +23)--------------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_comment], has_header=false +24)----------------------------------CoalesceBatchesExec: target_batch_size=8192 +25)------------------------------------RepartitionExec: partitioning=Hash([o_custkey@1], 4), input_partitions=4 +26)--------------------------------------CoalesceBatchesExec: target_batch_size=8192 +27)----------------------------------------FilterExec: o_orderkey@0 IS NOT NULL AND o_custkey@1 IS NOT NULL AND o_orderdate@2 >= 1993-10-01 AND o_orderdate@2 < 1994-01-01, projection=[o_orderkey@0, o_custkey@1] +28)------------------------------------------CsvExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_custkey, o_orderdate], has_header=false +29)--------------------------CoalesceBatchesExec: target_batch_size=8192 
+30)----------------------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4 +31)------------------------------CoalesceBatchesExec: target_batch_size=8192 +32)--------------------------------FilterExec: l_orderkey@0 IS NOT NULL AND l_returnflag@3 = R, projection=[l_orderkey@0, l_extendedprice@1, l_discount@2] +33)----------------------------------CsvExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], has_header=false +34)------------------CoalesceBatchesExec: target_batch_size=8192 +35)--------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4 +36)----------------------CoalesceBatchesExec: target_batch_size=8192 +37)------------------------FilterExec: n_nationkey@0 IS NOT NULL +38)--------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +39)----------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], has_header=false query ITRRTTTT select From 5132ff84915e539edd3fea388cd1cdbec41d43dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 7 Sep 2024 16:15:05 +0200 Subject: [PATCH 08/22] WIP --- .../optimizer/tests/optimizer_integration.rs | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/datafusion/optimizer/tests/optimizer_integration.rs b/datafusion/optimizer/tests/optimizer_integration.rs index 5292b66197f6..da5e92eafd11 100644 --- a/datafusion/optimizer/tests/optimizer_integration.rs 
+++ b/datafusion/optimizer/tests/optimizer_integration.rs @@ -177,15 +177,12 @@ fn intersect() -> Result<()> { let plan = test_sql(sql)?; let expected = "LeftSemi Join: test.col_int32 = test.col_int32, test.col_utf8 = test.col_utf8\ - \n Aggregate: groupBy=[[test.col_int32, test.col_utf8]], aggr=[[]]\ - \n LeftSemi Join: test.col_int32 = test.col_int32, test.col_utf8 = test.col_utf8\ - \n Aggregate: groupBy=[[test.col_int32, test.col_utf8]], aggr=[[]]\ - \n Filter: test.col_int32 IS NOT NULL AND test.col_utf8 IS NOT NULL\ - \n TableScan: test projection=[col_int32, col_utf8]\ - \n Filter: test.col_int32 IS NOT NULL AND test.col_utf8 IS NOT NULL\ - \n TableScan: test projection=[col_int32, col_utf8]\ - \n Filter: test.col_int32 IS NOT NULL AND test.col_utf8 IS NOT NULL\ - \n TableScan: test projection=[col_int32, col_utf8]"; + \n Aggregate: groupBy=[[test.col_int32, test.col_utf8]], aggr=[[]]\ + \n LeftSemi Join: test.col_int32 = test.col_int32, test.col_utf8 = test.col_utf8\ + \n Aggregate: groupBy=[[test.col_int32, test.col_utf8]], aggr=[[]]\ + \n TableScan: test projection=[col_int32, col_utf8]\ + \n TableScan: test projection=[col_int32, col_utf8]\ + \n TableScan: test projection=[col_int32, col_utf8]"; assert_eq!(expected, format!("{plan}")); Ok(()) } @@ -281,11 +278,9 @@ fn test_same_name_but_not_ambiguous() { let expected = "LeftSemi Join: t1.col_int32 = t2.col_int32\ \n Aggregate: groupBy=[[t1.col_int32]], aggr=[[]]\ \n SubqueryAlias: t1\ - \n Filter: test.col_int32 IS NOT NULL\ - \n TableScan: test projection=[col_int32]\ + \n TableScan: test projection=[col_int32]\ \n SubqueryAlias: t2\ - \n Filter: test.col_int32 IS NOT NULL\ - \n TableScan: test projection=[col_int32]"; + \n TableScan: test projection=[col_int32]"; assert_eq!(expected, format!("{plan}")); } From 245fc11a3a3d57e0cd6a2cf2e3d16a631b0fe216 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 7 Sep 2024 16:26:29 +0200 Subject: [PATCH 09/22] Add constraints --- 
.../test_files/tpch/create_tables.slt.part | 138 +++++++++--------- 1 file changed, 69 insertions(+), 69 deletions(-) diff --git a/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part b/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part index d6249cb57990..92507aaf947f 100644 --- a/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part @@ -23,102 +23,102 @@ statement ok CREATE EXTERNAL TABLE IF NOT EXISTS supplier ( - s_suppkey BIGINT, - s_name VARCHAR, - s_address VARCHAR, - s_nationkey BIGINT, - s_phone VARCHAR, - s_acctbal DECIMAL(15, 2), - s_comment VARCHAR, - s_rev VARCHAR, + s_suppkey BIGINT NOT NULL, + s_name VARCHAR NOT NULL, + s_address VARCHAR NOT NULL, + s_nationkey BIGINT NOT NULL, + s_phone VARCHAR NOT NULL, + s_acctbal DECIMAL(15, 2) NOT NULL, + s_comment VARCHAR NOT NULL, + s_rev VARCHAR NOT NULL, ) STORED AS CSV LOCATION 'test_files/tpch/data/supplier.tbl' OPTIONS ('format.delimiter' '|', 'format.has_header' 'false'); statement ok CREATE EXTERNAL TABLE IF NOT EXISTS part ( - p_partkey BIGINT, - p_name VARCHAR, - p_mfgr VARCHAR, - p_brand VARCHAR, - p_type VARCHAR, - p_size INTEGER, - p_container VARCHAR, - p_retailprice DECIMAL(15, 2), - p_comment VARCHAR, - p_rev VARCHAR, + p_partkey BIGINT NOT NULL, + p_name VARCHAR NOT NULL, + p_mfgr VARCHAR NOT NULL, + p_brand VARCHAR NOT NULL, + p_type VARCHAR NOT NULL, + p_size INTEGER NOT NULL, + p_container VARCHAR NOT NULL, + p_retailprice DECIMAL(15, 2) NOT NULL, + p_comment VARCHAR NOT NULL, + p_rev VARCHAR NOT NULL, ) STORED AS CSV LOCATION 'test_files/tpch/data/part.tbl' OPTIONS ('format.delimiter' '|', 'format.has_header' 'false'); statement ok CREATE EXTERNAL TABLE IF NOT EXISTS partsupp ( - ps_partkey BIGINT, - ps_suppkey BIGINT, - ps_availqty INTEGER, - ps_supplycost DECIMAL(15, 2), - ps_comment VARCHAR, - ps_rev VARCHAR, + ps_partkey BIGINT NOT NULL, + ps_suppkey BIGINT NOT NULL, + ps_availqty
INTEGER NOT NULL, + ps_supplycost DECIMAL(15, 2) NOT NULL, + ps_comment VARCHAR NOT NULL, + ps_rev VARCHAR NOT NULL, ) STORED AS CSV LOCATION 'test_files/tpch/data/partsupp.tbl' OPTIONS ('format.delimiter' '|', 'format.has_header' 'false'); statement ok CREATE EXTERNAL TABLE IF NOT EXISTS customer ( - c_custkey BIGINT, - c_name VARCHAR, - c_address VARCHAR, - c_nationkey BIGINT, - c_phone VARCHAR, - c_acctbal DECIMAL(15, 2), - c_mktsegment VARCHAR, - c_comment VARCHAR, - c_rev VARCHAR, + c_custkey BIGINT NOT NULL, + c_name VARCHAR NOT NULL, + c_address VARCHAR NOT NULL, + c_nationkey BIGINT NOT NULL, + c_phone VARCHAR NOT NULL, + c_acctbal DECIMAL(15, 2) NOT NULL, + c_mktsegment VARCHAR NOT NULL, + c_comment VARCHAR NOT NULL, + c_rev VARCHAR NOT NULL, ) STORED AS CSV LOCATION 'test_files/tpch/data/customer.tbl' OPTIONS ('format.delimiter' '|', 'format.has_header' 'false'); statement ok CREATE EXTERNAL TABLE IF NOT EXISTS orders ( - o_orderkey BIGINT, - o_custkey BIGINT, - o_orderstatus VARCHAR, - o_totalprice DECIMAL(15, 2), - o_orderdate DATE, - o_orderpriority VARCHAR, - o_clerk VARCHAR, - o_shippriority INTEGER, - o_comment VARCHAR, - o_rev VARCHAR, + o_orderkey BIGINT NOT NULL, + o_custkey BIGINT NOT NULL, + o_orderstatus VARCHAR NOT NULL, + o_totalprice DECIMAL(15, 2) NOT NULL, + o_orderdate DATE NOT NULL, + o_orderpriority VARCHAR NOT NULL, + o_clerk VARCHAR NOT NULL, + o_shippriority INTEGER NOT NULL, + o_comment VARCHAR NOT NULL, + o_rev VARCHAR NOT NULL, ) STORED AS CSV LOCATION 'test_files/tpch/data/orders.tbl' OPTIONS ('format.delimiter' '|', 'format.has_header' 'false'); statement ok CREATE EXTERNAL TABLE IF NOT EXISTS lineitem ( - l_orderkey BIGINT, - l_partkey BIGINT, - l_suppkey BIGINT, - l_linenumber INTEGER, - l_quantity DECIMAL(15, 2), - l_extendedprice DECIMAL(15, 2), - l_discount DECIMAL(15, 2), - l_tax DECIMAL(15, 2), - l_returnflag VARCHAR, - l_linestatus VARCHAR, - l_shipdate DATE, - l_commitdate DATE, - l_receiptdate DATE, - l_shipinstruct 
VARCHAR, - l_shipmode VARCHAR, - l_comment VARCHAR, - l_rev VARCHAR, + l_orderkey BIGINT NOT NULL, + l_partkey BIGINT NOT NULL, + l_suppkey BIGINT NOT NULL, + l_linenumber INTEGER NOT NULL, + l_quantity DECIMAL(15, 2) NOT NULL, + l_extendedprice DECIMAL(15, 2) NOT NULL, + l_discount DECIMAL(15, 2) NOT NULL, + l_tax DECIMAL(15, 2) NOT NULL, + l_returnflag VARCHAR NOT NULL, + l_linestatus VARCHAR NOT NULL, + l_shipdate DATE NOT NULL, + l_commitdate DATE NOT NULL, + l_receiptdate DATE NOT NULL, + l_shipinstruct VARCHAR NOT NULL, + l_shipmode VARCHAR NOT NULL, + l_comment VARCHAR NOT NULL, + l_rev VARCHAR NOT NULL, ) STORED AS CSV LOCATION 'test_files/tpch/data/lineitem.tbl' OPTIONS ('format.delimiter' '|', 'format.has_header' 'false'); statement ok CREATE EXTERNAL TABLE IF NOT EXISTS nation ( - n_nationkey BIGINT, - n_name VARCHAR, - n_regionkey BIGINT, - n_comment VARCHAR, - n_rev VARCHAR, + n_nationkey BIGINT NOT NULL, + n_name VARCHAR NOT NULL, + n_regionkey BIGINT NOT NULL, + n_comment VARCHAR NOT NULL, + n_rev VARCHAR NOT NULL, ) STORED AS CSV LOCATION 'test_files/tpch/data/nation.tbl' OPTIONS ('format.delimiter' '|', 'format.has_header' 'false'); statement ok CREATE EXTERNAL TABLE IF NOT EXISTS region ( - r_regionkey BIGINT, - r_name VARCHAR, - r_comment VARCHAR, - r_rev VARCHAR, + r_regionkey BIGINT NOT NULL, + r_name VARCHAR NOT NULL, + r_comment VARCHAR NOT NULL, + r_rev VARCHAR NOT NULL, ) STORED AS CSV LOCATION 'test_files/tpch/data/region.tbl' OPTIONS ('format.delimiter' '|', 'format.has_header' 'false'); From 6b83f1ac54ae5f9fd41b193aaeca2f15043d2fb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 7 Sep 2024 16:42:41 +0200 Subject: [PATCH 10/22] test failures --- .../sqllogictest/test_files/group_by.slt | 7 ++ datafusion/sqllogictest/test_files/join.slt | 2 + datafusion/sqllogictest/test_files/joins.slt | 7 +- .../sqllogictest/test_files/predicates.slt | 39 ++++++---- .../test_files/sort_merge_join.slt | 24 ++++-- 
.../sqllogictest/test_files/subquery.slt | 14 ++-- .../sqllogictest/test_files/tpch/q10.slt.part | 73 ++++++------------- 7 files changed, 86 insertions(+), 80 deletions(-) diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index 9eb7129027a7..4f04b75de1da 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -2015,6 +2015,13 @@ logical_plan 09)----------Filter: tab0.col0 IS NOT NULL 10)------------TableScan: tab0 projection=[col0, col1] physical_plan +01)SortPreservingMergeExec: [col0@0 ASC NULLS LAST] +02)--SortExec: expr=[col0@0 ASC NULLS LAST], preserve_partitioning=[true] +03)----ProjectionExec: expr=[col0@0 as col0, last_value(r.col1) ORDER BY [r.col0 ASC NULLS LAST]@3 as last_col1] +04)------AggregateExec: mode=FinalPartitioned, gby=[col0@0 as col0, col1@1 as col1, col2@2 as col2], aggr=[last_value(r.col1) ORDER BY [r.col0 ASC NULLS LAST]] +05)--------CoalesceBatchesExec: target_batch_size=8192 +06)----------RepartitionExec: partitioning=Hash([col0@0, col1@1, col2@2], 4), input_partitions=4 +07)------------AggregateExec: mode=Partial, gby=[col0@0 as col0, col1@1 as col1, col2@2 as col2], aggr=[last_value(r.col1) ORDER BY [r.col0 ASC NULLS LAST]] 08)--------------ProjectionExec: expr=[col0@2 as col0, col1@3 as col1, col2@4 as col2, col0@0 as col0, col1@1 as col1] 09)----------------CoalesceBatchesExec: target_batch_size=8192 10)------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(col0@0, col0@0)] diff --git a/datafusion/sqllogictest/test_files/join.slt b/datafusion/sqllogictest/test_files/join.slt index f5865f99acfe..1c43c63ddbdf 100644 --- a/datafusion/sqllogictest/test_files/join.slt +++ b/datafusion/sqllogictest/test_files/join.slt @@ -756,6 +756,8 @@ logical_plan 05)----Filter: t1.a IS NOT NULL 06)------TableScan: t1 projection=[a, b] physical_plan +01)CoalesceBatchesExec: target_batch_size=8192 +02)--HashJoinExec: 
mode=CollectLeft, join_type=Inner, on=[(a@0, a@0)] 03)----CoalesceBatchesExec: target_batch_size=8192 04)------FilterExec: a@0 IS NOT NULL 05)--------MemoryExec: partitions=1, partition_sizes=[1] diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index d4dd6359f259..2b6444da3663 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -1068,9 +1068,10 @@ WHERE join_t2.t2_int < 10 or (join_t1.t1_int > 2 and join_t2.t2_name != 'w') ---- logical_plan 01)Inner Join: join_t1.t1_id = join_t2.t2_id Filter: join_t2.t2_int < UInt32(10) OR join_t1.t1_int > UInt32(2) AND join_t2.t2_name != Utf8("w") -02)--TableScan: join_t1 projection=[t1_id, t1_name, t1_int] -03)--Filter: join_t2.t2_int < UInt32(10) OR join_t2.t2_name != Utf8("w") -04)----TableScan: join_t2 projection=[t2_id, t2_name, t2_int] +02)--Filter: join_t1.t1_id IS NOT NULL +03)----TableScan: join_t1 projection=[t1_id, t1_name, t1_int] +04)--Filter: (join_t2.t2_int < UInt32(10) OR join_t2.t2_name != Utf8("w")) AND join_t2.t2_id IS NOT NULL +05)----TableScan: join_t2 projection=[t2_id, t2_name, t2_int] # Reduce left join 3 (to inner join) diff --git a/datafusion/sqllogictest/test_files/predicates.slt b/datafusion/sqllogictest/test_files/predicates.slt index 646cd7e00f21..2edd71d88ff1 100644 --- a/datafusion/sqllogictest/test_files/predicates.slt +++ b/datafusion/sqllogictest/test_files/predicates.slt @@ -753,11 +753,13 @@ logical_plan 03)----Inner Join: part.p_partkey = partsupp.ps_partkey 04)------Projection: lineitem.l_extendedprice, lineitem.l_discount, part.p_partkey 05)--------Inner Join: lineitem.l_partkey = part.p_partkey -06)----------TableScan: lineitem projection=[l_partkey, l_extendedprice, l_discount] -07)----------Projection: part.p_partkey -08)------------Filter: part.p_brand = Utf8("Brand#12") OR part.p_brand = Utf8("Brand#23") -09)--------------TableScan: part projection=[p_partkey, p_brand], 
partial_filters=[part.p_brand = Utf8("Brand#12") OR part.p_brand = Utf8("Brand#23")] -10)------TableScan: partsupp projection=[ps_partkey, ps_suppkey] +06)----------Filter: lineitem.l_partkey IS NOT NULL +07)------------TableScan: lineitem projection=[l_partkey, l_extendedprice, l_discount], partial_filters=[lineitem.l_partkey IS NOT NULL] +08)----------Projection: part.p_partkey +09)------------Filter: part.p_partkey IS NOT NULL AND (part.p_brand = Utf8("Brand#12") OR part.p_brand = Utf8("Brand#23")) +10)--------------TableScan: part projection=[p_partkey, p_brand], partial_filters=[part.p_brand = Utf8("Brand#12") OR part.p_brand = Utf8("Brand#23"), part.p_partkey IS NOT NULL] +11)------Filter: partsupp.ps_partkey IS NOT NULL +12)--------TableScan: partsupp projection=[ps_partkey, ps_suppkey] physical_plan 01)AggregateExec: mode=SinglePartitioned, gby=[p_partkey@2 as p_partkey], aggr=[sum(lineitem.l_extendedprice), avg(lineitem.l_discount), count(DISTINCT partsupp.ps_suppkey)] 02)--CoalesceBatchesExec: target_batch_size=8192 @@ -766,17 +768,22 @@ physical_plan 05)--------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_partkey@0, p_partkey@0)], projection=[l_extendedprice@1, l_discount@2, p_partkey@3] 06)----------CoalesceBatchesExec: target_batch_size=8192 07)------------RepartitionExec: partitioning=Hash([l_partkey@0], 4), input_partitions=4 -08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -09)----------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/tpch-csv/lineitem.csv]]}, projection=[l_partkey, l_extendedprice, l_discount], has_header=true -10)----------CoalesceBatchesExec: target_batch_size=8192 -11)------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4 -12)--------------CoalesceBatchesExec: target_batch_size=8192 -13)----------------FilterExec: p_brand@1 = Brand#12 OR p_brand@1 = Brand#23, projection=[p_partkey@0] -14)------------------RepartitionExec: 
partitioning=RoundRobinBatch(4), input_partitions=1 -15)--------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/tpch-csv/part.csv]]}, projection=[p_partkey, p_brand], has_header=true -16)------CoalesceBatchesExec: target_batch_size=8192 -17)--------RepartitionExec: partitioning=Hash([ps_partkey@0], 4), input_partitions=1 -18)----------MemoryExec: partitions=1, partition_sizes=[1] +08)--------------CoalesceBatchesExec: target_batch_size=8192 +09)----------------FilterExec: l_partkey@0 IS NOT NULL +10)------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +11)--------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/tpch-csv/lineitem.csv]]}, projection=[l_partkey, l_extendedprice, l_discount], has_header=true +12)----------CoalesceBatchesExec: target_batch_size=8192 +13)------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4 +14)--------------CoalesceBatchesExec: target_batch_size=8192 +15)----------------FilterExec: p_partkey@0 IS NOT NULL AND (p_brand@1 = Brand#12 OR p_brand@1 = Brand#23), projection=[p_partkey@0] +16)------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +17)--------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/tpch-csv/part.csv]]}, projection=[p_partkey, p_brand], has_header=true +18)------CoalesceBatchesExec: target_batch_size=8192 +19)--------RepartitionExec: partitioning=Hash([ps_partkey@0], 4), input_partitions=4 +20)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +21)------------CoalesceBatchesExec: target_batch_size=8192 +22)--------------FilterExec: ps_partkey@0 IS NOT NULL +23)----------------MemoryExec: partitions=1, partition_sizes=[1] # Inlist simplification diff --git a/datafusion/sqllogictest/test_files/sort_merge_join.slt b/datafusion/sqllogictest/test_files/sort_merge_join.slt index ebd53e9690fc..e9b4c73e492b 
100644 --- a/datafusion/sqllogictest/test_files/sort_merge_join.slt +++ b/datafusion/sqllogictest/test_files/sort_merge_join.slt @@ -34,14 +34,26 @@ EXPLAIN SELECT t1.a, t1.b, t2.a, t2.b FROM t1 JOIN t2 ON t1.a = t2.a AND t2.b * ---- logical_plan 01)Inner Join: t1.a = t2.a Filter: CAST(t2.b AS Int64) * Int64(50) <= CAST(t1.b AS Int64) -02)--TableScan: t1 projection=[a, b] -03)--TableScan: t2 projection=[a, b] +02)--Filter: t1.a IS NOT NULL +03)----TableScan: t1 projection=[a, b] +04)--Filter: t2.a IS NOT NULL +05)----TableScan: t2 projection=[a, b] physical_plan 01)SortMergeJoin: join_type=Inner, on=[(a@0, a@0)], filter=CAST(b@1 AS Int64) * 50 <= CAST(b@0 AS Int64) -02)--SortExec: expr=[a@0 ASC], preserve_partitioning=[false] -03)----MemoryExec: partitions=1, partition_sizes=[1] -04)--SortExec: expr=[a@0 ASC], preserve_partitioning=[false] -05)----MemoryExec: partitions=1, partition_sizes=[1] +02)--SortExec: expr=[a@0 ASC], preserve_partitioning=[true] +03)----CoalesceBatchesExec: target_batch_size=8192 +04)------RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 +05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +06)----------CoalesceBatchesExec: target_batch_size=8192 +07)------------FilterExec: a@0 IS NOT NULL +08)--------------MemoryExec: partitions=1, partition_sizes=[1] +09)--SortExec: expr=[a@0 ASC], preserve_partitioning=[true] +10)----CoalesceBatchesExec: target_batch_size=8192 +11)------RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 +12)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +13)----------CoalesceBatchesExec: target_batch_size=8192 +14)------------FilterExec: a@0 IS NOT NULL +15)--------------MemoryExec: partitions=1, partition_sizes=[1] # inner join with join filter query TITI rowsort diff --git a/datafusion/sqllogictest/test_files/subquery.slt b/datafusion/sqllogictest/test_files/subquery.slt index 4469bebeeb84..3e907ad88251 100644 --- 
a/datafusion/sqllogictest/test_files/subquery.slt +++ b/datafusion/sqllogictest/test_files/subquery.slt @@ -235,7 +235,8 @@ logical_plan 04)----SubqueryAlias: __scalar_sq_1 05)------Projection: sum(t2.t2_int * Float64(1)) + Float64(1) AS sum(t2.t2_int * Float64(1)) + Int64(1), t2.t2_id 06)--------Aggregate: groupBy=[[t2.t2_id]], aggr=[[sum(CAST(t2.t2_int AS Float64)) AS sum(t2.t2_int * Float64(1))]] -07)----------TableScan: t2 projection=[t2_id, t2_int] +07)----------Filter: t2.t2_id IS NOT NULL +08)------------TableScan: t2 projection=[t2_id, t2_int] physical_plan 01)ProjectionExec: expr=[t1_id@1 as t1_id, sum(t2.t2_int * Float64(1)) + Int64(1)@0 as t2_sum] 02)--CoalesceBatchesExec: target_batch_size=2 @@ -245,11 +246,12 @@ physical_plan 06)----------CoalesceBatchesExec: target_batch_size=2 07)------------RepartitionExec: partitioning=Hash([t2_id@0], 4), input_partitions=4 08)--------------AggregateExec: mode=Partial, gby=[t2_id@0 as t2_id], aggr=[sum(t2.t2_int * Float64(1))] -09)----------------MemoryExec: partitions=4, partition_sizes=[1, 0, 0, 0] -10)------CoalesceBatchesExec: target_batch_size=2 -11)--------RepartitionExec: partitioning=Hash([t1_id@0], 4), input_partitions=4 -12)----------MemoryExec: partitions=4, partition_sizes=[1, 0, 0, 0] - +09)----------------CoalesceBatchesExec: target_batch_size=2 +10)------------------FilterExec: t2_id@0 IS NOT NULL +11)--------------------MemoryExec: partitions=4, partition_sizes=[1, 0, 0, 0] +12)------CoalesceBatchesExec: target_batch_size=2 +13)--------RepartitionExec: partitioning=Hash([t1_id@0], 4), input_partitions=4 +14)----------MemoryExec: partitions=4, partition_sizes=[1, 0, 0, 0] query IR rowsort SELECT t1_id, (SELECT sum(t2_int * 1.0) + 1 FROM t2 WHERE t2.t2_id = t1.t1_id) as t2_sum from t1 ---- diff --git a/datafusion/sqllogictest/test_files/tpch/q10.slt.part b/datafusion/sqllogictest/test_files/tpch/q10.slt.part index d9779da8e629..aed5e055e4db 100644 --- 
a/datafusion/sqllogictest/test_files/tpch/q10.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/q10.slt.part @@ -60,56 +60,31 @@ logical_plan 07)------------Inner Join: orders.o_orderkey = lineitem.l_orderkey 08)--------------Projection: customer.c_custkey, customer.c_name, customer.c_address, customer.c_nationkey, customer.c_phone, customer.c_acctbal, customer.c_comment, orders.o_orderkey 09)----------------Inner Join: customer.c_custkey = orders.o_custkey -10)------------------Filter: customer.c_nationkey IS NOT NULL AND customer.c_custkey IS NOT NULL -11)--------------------TableScan: customer projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_comment], partial_filters=[customer.c_nationkey IS NOT NULL, customer.c_custkey IS NOT NULL] -12)------------------Projection: orders.o_orderkey, orders.o_custkey -13)--------------------Filter: orders.o_orderkey IS NOT NULL AND orders.o_custkey IS NOT NULL AND orders.o_orderdate >= Date32("1993-10-01") AND orders.o_orderdate < Date32("1994-01-01") -14)----------------------TableScan: orders projection=[o_orderkey, o_custkey, o_orderdate], partial_filters=[orders.o_orderdate >= Date32("1993-10-01"), orders.o_orderdate < Date32("1994-01-01"), orders.o_orderkey IS NOT NULL, orders.o_custkey IS NOT NULL] -15)--------------Projection: lineitem.l_orderkey, lineitem.l_extendedprice, lineitem.l_discount -16)----------------Filter: lineitem.l_orderkey IS NOT NULL AND lineitem.l_returnflag = Utf8("R") -17)------------------TableScan: lineitem projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], partial_filters=[lineitem.l_returnflag = Utf8("R"), lineitem.l_orderkey IS NOT NULL] -18)----------Filter: nation.n_nationkey IS NOT NULL -19)------------TableScan: nation projection=[n_nationkey, n_name], partial_filters=[nation.n_nationkey IS NOT NULL] +10)------------------TableScan: customer projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_comment] 
+11)------------------Projection: orders.o_orderkey, orders.o_custkey +12)--------------------Filter: orders.o_orderdate >= Date32("1993-10-01") AND orders.o_orderdate < Date32("1994-01-01") +13)----------------------TableScan: orders projection=[o_orderkey, o_custkey, o_orderdate], partial_filters=[orders.o_orderdate >= Date32("1993-10-01"), orders.o_orderdate < Date32("1994-01-01")] +14)--------------Projection: lineitem.l_orderkey, lineitem.l_extendedprice, lineitem.l_discount +15)----------------Filter: lineitem.l_returnflag = Utf8("R") +16)------------------TableScan: lineitem projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], partial_filters=[lineitem.l_returnflag = Utf8("R")] +17)----------TableScan: nation projection=[n_nationkey, n_name] physical_plan -01)SortPreservingMergeExec: [revenue@2 DESC], fetch=10 -02)--SortExec: TopK(fetch=10), expr=[revenue@2 DESC], preserve_partitioning=[true] -03)----ProjectionExec: expr=[c_custkey@0 as c_custkey, c_name@1 as c_name, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@7 as revenue, c_acctbal@2 as c_acctbal, n_name@4 as n_name, c_address@5 as c_address, c_phone@3 as c_phone, c_comment@6 as c_comment] -04)------AggregateExec: mode=FinalPartitioned, gby=[c_custkey@0 as c_custkey, c_name@1 as c_name, c_acctbal@2 as c_acctbal, c_phone@3 as c_phone, n_name@4 as n_name, c_address@5 as c_address, c_comment@6 as c_comment], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] -05)--------CoalesceBatchesExec: target_batch_size=8192 -06)----------RepartitionExec: partitioning=Hash([c_custkey@0, c_name@1, c_acctbal@2, c_phone@3, n_name@4, c_address@5, c_comment@6], 4), input_partitions=4 -07)------------AggregateExec: mode=Partial, gby=[c_custkey@0 as c_custkey, c_name@1 as c_name, c_acctbal@4 as c_acctbal, c_phone@3 as c_phone, n_name@8 as n_name, c_address@2 as c_address, c_comment@5 as c_comment], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] 
-08)--------------CoalesceBatchesExec: target_batch_size=8192 -09)----------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_nationkey@3, n_nationkey@0)], projection=[c_custkey@0, c_name@1, c_address@2, c_phone@4, c_acctbal@5, c_comment@6, l_extendedprice@7, l_discount@8, n_name@10] -10)------------------CoalesceBatchesExec: target_batch_size=8192 -11)--------------------RepartitionExec: partitioning=Hash([c_nationkey@3], 4), input_partitions=4 -12)----------------------CoalesceBatchesExec: target_batch_size=8192 -13)------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@7, l_orderkey@0)], projection=[c_custkey@0, c_name@1, c_address@2, c_nationkey@3, c_phone@4, c_acctbal@5, c_comment@6, l_extendedprice@9, l_discount@10] -14)--------------------------CoalesceBatchesExec: target_batch_size=8192 -15)----------------------------RepartitionExec: partitioning=Hash([o_orderkey@7], 4), input_partitions=4 -16)------------------------------CoalesceBatchesExec: target_batch_size=8192 -17)--------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_custkey@0, c_name@1, c_address@2, c_nationkey@3, c_phone@4, c_acctbal@5, c_comment@6, o_orderkey@7] -18)----------------------------------CoalesceBatchesExec: target_batch_size=8192 -19)------------------------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=4 -20)--------------------------------------CoalesceBatchesExec: target_batch_size=8192 -21)----------------------------------------FilterExec: c_nationkey@3 IS NOT NULL AND c_custkey@0 IS NOT NULL -22)------------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -23)--------------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, 
c_acctbal, c_comment], has_header=false -24)----------------------------------CoalesceBatchesExec: target_batch_size=8192 -25)------------------------------------RepartitionExec: partitioning=Hash([o_custkey@1], 4), input_partitions=4 -26)--------------------------------------CoalesceBatchesExec: target_batch_size=8192 -27)----------------------------------------FilterExec: o_orderkey@0 IS NOT NULL AND o_custkey@1 IS NOT NULL AND o_orderdate@2 >= 1993-10-01 AND o_orderdate@2 < 1994-01-01, projection=[o_orderkey@0, o_custkey@1] -28)------------------------------------------CsvExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_custkey, o_orderdate], has_header=false -29)--------------------------CoalesceBatchesExec: target_batch_size=8192 -30)----------------------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4 -31)------------------------------CoalesceBatchesExec: target_batch_size=8192 -32)--------------------------------FilterExec: l_orderkey@0 IS NOT NULL AND l_returnflag@3 = R, projection=[l_orderkey@0, l_extendedprice@1, l_discount@2] -33)----------------------------------CsvExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], has_header=false 
-34)------------------CoalesceBatchesExec: target_batch_size=8192 -35)--------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4 -36)----------------------CoalesceBatchesExec: target_batch_size=8192 -37)------------------------FilterExec: n_nationkey@0 IS NOT NULL -38)--------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -39)----------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], has_header=false +20)--------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +21)----------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_comment], has_header=false +22)----------------------------------CoalesceBatchesExec: target_batch_size=8192 +23)------------------------------------RepartitionExec: partitioning=Hash([o_custkey@1], 4), input_partitions=4 +24)--------------------------------------CoalesceBatchesExec: target_batch_size=8192 +25)----------------------------------------FilterExec: o_orderdate@2 >= 1993-10-01 AND o_orderdate@2 < 1994-01-01, projection=[o_orderkey@0, o_custkey@1] +26)------------------------------------------CsvExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:0..4223281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:4223281..8446562], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:8446562..12669843], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/orders.tbl:12669843..16893122]]}, projection=[o_orderkey, o_custkey, o_orderdate], has_header=false +27)--------------------------CoalesceBatchesExec: target_batch_size=8192 
+28)----------------------------RepartitionExec: partitioning=Hash([l_orderkey@0], 4), input_partitions=4 +29)------------------------------CoalesceBatchesExec: target_batch_size=8192 +30)--------------------------------FilterExec: l_returnflag@3 = R, projection=[l_orderkey@0, l_extendedprice@1, l_discount@2] +31)----------------------------------CsvExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], has_header=false +32)------------------CoalesceBatchesExec: target_batch_size=8192 +33)--------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4 +34)----------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +35)------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], has_header=false query ITRRTTTT select From 8fa7295bcf650aa52afd8e928b24a25ea2769659 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 7 Sep 2024 17:16:36 +0200 Subject: [PATCH 11/22] Wip --- .../sqllogictest/test_files/group_by.slt | 16 +++++++++++----- datafusion/sqllogictest/test_files/join.slt | 19 +++++++++++-------- datafusion/sqllogictest/test_files/joins.slt | 7 ++++--- .../sqllogictest/test_files/subquery.slt | 1 + 4 files changed, 27 insertions(+), 16 deletions(-) diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index 4f04b75de1da..1df9736df6cd 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ 
b/datafusion/sqllogictest/test_files/group_by.slt @@ -2879,9 +2879,11 @@ logical_plan 04)------Projection: s.zip_code, s.country, s.sn, s.ts, s.currency, e.sn, e.amount 05)--------Inner Join: s.currency = e.currency Filter: s.ts >= e.ts 06)----------SubqueryAlias: s -07)------------TableScan: sales_global projection=[zip_code, country, sn, ts, currency] -08)----------SubqueryAlias: e -09)------------TableScan: sales_global projection=[sn, ts, currency, amount] +07)------------Filter: sales_global.currency IS NOT NULL +08)--------------TableScan: sales_global projection=[zip_code, country, sn, ts, currency] +09)----------SubqueryAlias: e +10)------------Filter: sales_global.currency IS NOT NULL +11)--------------TableScan: sales_global projection=[sn, ts, currency, amount] physical_plan 01)SortExec: expr=[sn@2 ASC NULLS LAST], preserve_partitioning=[false] 02)--ProjectionExec: expr=[zip_code@1 as zip_code, country@2 as country, sn@0 as sn, ts@3 as ts, currency@4 as currency, last_value(e.amount) ORDER BY [e.sn ASC NULLS LAST]@5 as last_rate] @@ -2889,8 +2891,12 @@ physical_plan 04)------ProjectionExec: expr=[zip_code@2 as zip_code, country@3 as country, sn@4 as sn, ts@5 as ts, currency@6 as currency, sn@0 as sn, amount@1 as amount] 05)--------CoalesceBatchesExec: target_batch_size=8192 06)----------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(currency@2, currency@4)], filter=ts@0 >= ts@1, projection=[sn@0, amount@3, zip_code@4, country@5, sn@6, ts@7, currency@8] -07)------------MemoryExec: partitions=1, partition_sizes=[1] -08)------------MemoryExec: partitions=1, partition_sizes=[1] +07)------------CoalesceBatchesExec: target_batch_size=8192 +08)--------------FilterExec: currency@2 IS NOT NULL +09)----------------MemoryExec: partitions=1, partition_sizes=[1] +10)------------CoalesceBatchesExec: target_batch_size=8192 +11)--------------FilterExec: currency@4 IS NOT NULL +12)----------------MemoryExec: partitions=1, partition_sizes=[1] query ITIPTR rowsort 
SELECT s.zip_code, s.country, s.sn, s.ts, s.currency, LAST_VALUE(e.amount ORDER BY e.sn) AS last_rate diff --git a/datafusion/sqllogictest/test_files/join.slt b/datafusion/sqllogictest/test_files/join.slt index 1c43c63ddbdf..83a161bc3b8e 100644 --- a/datafusion/sqllogictest/test_files/join.slt +++ b/datafusion/sqllogictest/test_files/join.slt @@ -965,17 +965,20 @@ logical_plan 05)--------Filter: employees.name = Utf8("Alice") OR employees.name != Utf8("Alice") AND employees.name = Utf8("Carol") 06)----------TableScan: employees projection=[emp_id, name] 07)------SubqueryAlias: d -08)--------TableScan: department projection=[emp_id, dept_name] +08)--------Filter: department.emp_id IS NOT NULL +09)----------TableScan: department projection=[emp_id, dept_name] physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--FilterExec: dept_name@2 != Engineering AND name@1 = Alice OR name@1 != Alice AND name@1 = Carol -03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -04)------CoalesceBatchesExec: target_batch_size=8192 -05)--------HashJoinExec: mode=CollectLeft, join_type=Left, on=[(emp_id@0, emp_id@0)], projection=[emp_id@0, name@1, dept_name@3] -06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: name@1 = Alice OR name@1 != Alice AND name@1 = Carol -08)--------------MemoryExec: partitions=1, partition_sizes=[1] -09)----------MemoryExec: partitions=1, partition_sizes=[1] +03)----CoalesceBatchesExec: target_batch_size=8192 +04)------HashJoinExec: mode=CollectLeft, join_type=Left, on=[(emp_id@0, emp_id@0)], projection=[emp_id@0, name@1, dept_name@3] +05)--------CoalesceBatchesExec: target_batch_size=8192 +06)----------FilterExec: name@1 = Alice OR name@1 != Alice AND name@1 = Carol +07)------------MemoryExec: partitions=1, partition_sizes=[1] +08)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +09)----------CoalesceBatchesExec: target_batch_size=8192 +10)------------FilterExec: 
emp_id@0 IS NOT NULL +11)--------------MemoryExec: partitions=1, partition_sizes=[1] query ITT SELECT e.emp_id, e.name, d.dept_name diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index 2b6444da3663..d9e901b1eeca 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -1092,12 +1092,13 @@ logical_plan 02)--SubqueryAlias: t3 03)----Projection: join_t1.t1_id, join_t1.t1_name, join_t1.t1_int 04)------Inner Join: join_t1.t1_id = join_t2.t2_id -05)--------Filter: join_t1.t1_id < UInt32(100) +05)--------Filter: join_t1.t1_id IS NOT NULL AND join_t1.t1_id < UInt32(100) 06)----------TableScan: join_t1 projection=[t1_id, t1_name, t1_int] 07)--------Projection: join_t2.t2_id -08)----------Filter: join_t2.t2_int < UInt32(3) AND join_t2.t2_id < UInt32(100) +08)----------Filter: join_t2.t2_id IS NOT NULL AND join_t2.t2_int < UInt32(3) AND join_t2.t2_id < UInt32(100) 09)------------TableScan: join_t2 projection=[t2_id, t2_int] -10)--TableScan: join_t2 projection=[t2_id, t2_name, t2_int] +10)--Filter: join_t2.t2_int IS NOT NULL +11)----TableScan: join_t2 projection=[t2_id, t2_name, t2_int] # Reduce right join 1 (to inner join) diff --git a/datafusion/sqllogictest/test_files/subquery.slt b/datafusion/sqllogictest/test_files/subquery.slt index 3e907ad88251..aa39e796cc15 100644 --- a/datafusion/sqllogictest/test_files/subquery.slt +++ b/datafusion/sqllogictest/test_files/subquery.slt @@ -252,6 +252,7 @@ physical_plan 12)------CoalesceBatchesExec: target_batch_size=2 13)--------RepartitionExec: partitioning=Hash([t1_id@0], 4), input_partitions=4 14)----------MemoryExec: partitions=4, partition_sizes=[1, 0, 0, 0] + query IR rowsort SELECT t1_id, (SELECT sum(t2_int * 1.0) + 1 FROM t2 WHERE t2.t2_id = t1.t1_id) as t2_sum from t1 ---- From 0eca1290615a0acadb0d55bade1fe59e9ca3c974 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 7 Sep 2024 17:29:27 
+0200 Subject: [PATCH 12/22] Wip --- .../sqllogictest/test_files/group_by.slt | 24 +++++++++------ datafusion/sqllogictest/test_files/join.slt | 30 ++++++++++++------- datafusion/sqllogictest/test_files/joins.slt | 24 +++++++++------ 3 files changed, 49 insertions(+), 29 deletions(-) diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index 1df9736df6cd..9de34a763e31 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -3881,20 +3881,26 @@ logical_plan 05)--------Projection: l.a, l.d, row_n 06)----------Inner Join: l.d = r.d Filter: CAST(l.a AS Int64) >= CAST(r.a AS Int64) - Int64(10) 07)------------SubqueryAlias: l -08)--------------TableScan: multiple_ordered_table projection=[a, d] -09)------------Projection: r.a, r.d, row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS row_n -10)--------------WindowAggr: windowExpr=[[row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] -11)----------------SubqueryAlias: r -12)------------------TableScan: multiple_ordered_table projection=[a, d] +08)--------------Filter: multiple_ordered_table.d IS NOT NULL +09)----------------TableScan: multiple_ordered_table projection=[a, d], partial_filters=[multiple_ordered_table.d IS NOT NULL] +10)------------Projection: r.a, r.d, row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS row_n +11)--------------Filter: r.d IS NOT NULL +12)----------------WindowAggr: windowExpr=[[row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] +13)------------------SubqueryAlias: r +14)--------------------TableScan: multiple_ordered_table projection=[a, d] physical_plan 01)ProjectionExec: expr=[last_value(l.d) ORDER BY [l.a ASC NULLS LAST]@1 as amount_usd] 02)--AggregateExec: mode=Single, gby=[row_n@2 as row_n], 
aggr=[last_value(l.d) ORDER BY [l.a ASC NULLS LAST]], ordering_mode=Sorted 03)----CoalesceBatchesExec: target_batch_size=2 04)------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(d@1, d@1)], filter=CAST(a@0 AS Int64) >= CAST(a@1 AS Int64) - 10, projection=[a@0, d@1, row_n@4] -05)--------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], has_header=true -06)--------ProjectionExec: expr=[a@0 as a, d@1 as d, row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as row_n] -07)----------BoundedWindowAggExec: wdw=[row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted] -08)------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], has_header=true +05)--------CoalesceBatchesExec: target_batch_size=2 +06)----------FilterExec: d@1 IS NOT NULL +07)------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], has_header=true +08)--------ProjectionExec: expr=[a@0 as a, d@1 as d, row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as row_n] +09)----------CoalesceBatchesExec: target_batch_size=2 +10)------------FilterExec: d@1 IS NOT NULL +11)--------------BoundedWindowAggExec: wdw=[row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "row_number() ORDER BY [r.a ASC NULLS 
LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted] +12)----------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], has_header=true # reset partition number to 8. statement ok diff --git a/datafusion/sqllogictest/test_files/join.slt b/datafusion/sqllogictest/test_files/join.slt index 83a161bc3b8e..d15962eb6181 100644 --- a/datafusion/sqllogictest/test_files/join.slt +++ b/datafusion/sqllogictest/test_files/join.slt @@ -1162,22 +1162,30 @@ logical_plan 02)--Inner Join: CAST(t1.v0 AS Float64) = t0.v1 Filter: t0.v1 + CAST(t5.v0 AS Float64) > Float64(0) 03)----Projection: t1.v0, t1.v1, t5.v0, t5.v2, t5.v3, t5.v4 04)------Inner Join: Using t1.v0 = t5.v0, t1.v1 = t5.v1 -05)--------TableScan: t1 projection=[v0, v1] -06)--------TableScan: t5 projection=[v0, v1, v2, v3, v4] -07)----TableScan: t0 projection=[v0, v1] +05)--------Filter: CAST(t1.v0 AS Float64) IS NOT NULL AND t1.v0 IS NOT NULL AND t1.v1 IS NOT NULL +06)----------TableScan: t1 projection=[v0, v1] +07)--------Filter: CAST(t5.v0 AS Float64) IS NOT NULL AND t5.v0 IS NOT NULL AND t5.v1 IS NOT NULL +08)----------TableScan: t5 projection=[v0, v1, v2, v3, v4] +09)----Filter: t0.v1 IS NOT NULL +10)------TableScan: t0 projection=[v0, v1] physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(CAST(t1.v0 AS Float64)@6, v1@1)], filter=v1@1 + CAST(v0@0 AS Float64) > 0, projection=[v0@0, v1@1, v2@3, v3@4, v4@5, v0@7, v1@8] 03)----CoalescePartitionsExec 04)------ProjectionExec: expr=[v0@0 as v0, v1@1 as v1, v0@2 as v0, v2@3 as v2, v3@4 as v3, v4@5 as v4, CAST(v0@0 AS Float64) as CAST(t1.v0 AS Float64)] -05)--------RepartitionExec: 
partitioning=RoundRobinBatch(4), input_partitions=1 -06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(v0@0, v0@0), (v1@1, v1@1)], projection=[v0@0, v1@1, v0@2, v2@4, v3@5, v4@6] -08)--------------MemoryExec: partitions=1, partition_sizes=[0] -09)--------------MemoryExec: partitions=1, partition_sizes=[0] -10)----MemoryExec: partitions=1, partition_sizes=[0] - - +05)--------CoalesceBatchesExec: target_batch_size=8192 +06)----------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(v0@0, v0@0), (v1@1, v1@1)], projection=[v0@0, v1@1, v0@2, v2@4, v3@5, v4@6] +07)------------CoalesceBatchesExec: target_batch_size=8192 +08)--------------FilterExec: CAST(v0@0 AS Float64) IS NOT NULL AND v0@0 IS NOT NULL AND v1@1 IS NOT NULL +09)----------------MemoryExec: partitions=1, partition_sizes=[0] +10)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +11)--------------CoalesceBatchesExec: target_batch_size=8192 +12)----------------FilterExec: CAST(v0@0 AS Float64) IS NOT NULL AND v0@0 IS NOT NULL AND v1@1 IS NOT NULL +13)------------------MemoryExec: partitions=1, partition_sizes=[0] +14)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +15)------CoalesceBatchesExec: target_batch_size=8192 +16)--------FilterExec: v1@1 IS NOT NULL +17)----------MemoryExec: partitions=1, partition_sizes=[0] statement ok drop table t5; diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index d9e901b1eeca..5ce67278d2ba 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -3457,20 +3457,26 @@ logical_plan 05)--------Projection: l.a, l.d, row_n 06)----------Inner Join: l.d = r.d Filter: CAST(l.a AS Int64) >= CAST(r.a AS Int64) - Int64(10) 07)------------SubqueryAlias: l -08)--------------TableScan: multiple_ordered_table projection=[a, d] 
-09)------------Projection: r.a, r.d, row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS row_n -10)--------------WindowAggr: windowExpr=[[row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] -11)----------------SubqueryAlias: r -12)------------------TableScan: multiple_ordered_table projection=[a, d] +08)--------------Filter: multiple_ordered_table.d IS NOT NULL +09)----------------TableScan: multiple_ordered_table projection=[a, d], partial_filters=[multiple_ordered_table.d IS NOT NULL] +10)------------Projection: r.a, r.d, row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS row_n +11)--------------Filter: r.d IS NOT NULL +12)----------------WindowAggr: windowExpr=[[row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] +13)------------------SubqueryAlias: r +14)--------------------TableScan: multiple_ordered_table projection=[a, d] physical_plan 01)ProjectionExec: expr=[last_value(l.d) ORDER BY [l.a ASC NULLS LAST]@1 as amount_usd] 02)--AggregateExec: mode=Single, gby=[row_n@2 as row_n], aggr=[last_value(l.d) ORDER BY [l.a ASC NULLS LAST]], ordering_mode=Sorted 03)----CoalesceBatchesExec: target_batch_size=2 04)------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(d@1, d@1)], filter=CAST(a@0 AS Int64) >= CAST(a@1 AS Int64) - 10, projection=[a@0, d@1, row_n@4] -05)--------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], has_header=true -06)--------ProjectionExec: expr=[a@0 as a, d@1 as d, row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as row_n] -07)----------BoundedWindowAggExec: wdw=[row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "row_number() ORDER BY [r.a ASC NULLS LAST] RANGE 
BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted] -08)------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], has_header=true +05)--------CoalesceBatchesExec: target_batch_size=2 +06)----------FilterExec: d@1 IS NOT NULL +07)------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], has_header=true +08)--------ProjectionExec: expr=[a@0 as a, d@1 as d, row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as row_n] +09)----------CoalesceBatchesExec: target_batch_size=2 +10)------------FilterExec: d@1 IS NOT NULL +11)--------------BoundedWindowAggExec: wdw=[row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted] +12)----------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], has_header=true # run query above in multiple partitions statement ok From 419165ecef5393e0c8465fdca3911e20cb9bb278 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 7 Sep 2024 17:45:15 +0200 Subject: [PATCH 13/22] Wip --- .../sqllogictest/test_files/group_by.slt | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git 
a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index 9de34a763e31..7d87b29cd2eb 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -4044,21 +4044,27 @@ logical_plan 03)----SubqueryAlias: lhs 04)------Projection: multiple_ordered_table_with_pk.c, multiple_ordered_table_with_pk.b, sum(multiple_ordered_table_with_pk.d) AS sum1 05)--------Aggregate: groupBy=[[multiple_ordered_table_with_pk.c, multiple_ordered_table_with_pk.b]], aggr=[[sum(CAST(multiple_ordered_table_with_pk.d AS Int64))]] -06)----------TableScan: multiple_ordered_table_with_pk projection=[b, c, d] -07)----SubqueryAlias: rhs -08)------Projection: multiple_ordered_table_with_pk.c, multiple_ordered_table_with_pk.b, sum(multiple_ordered_table_with_pk.d) AS sum1 -09)--------Aggregate: groupBy=[[multiple_ordered_table_with_pk.c, multiple_ordered_table_with_pk.b]], aggr=[[sum(CAST(multiple_ordered_table_with_pk.d AS Int64))]] -10)----------TableScan: multiple_ordered_table_with_pk projection=[b, c, d] +06)----------Filter: multiple_ordered_table_with_pk.b IS NOT NULL +07)------------TableScan: multiple_ordered_table_with_pk projection=[b, c, d], partial_filters=[multiple_ordered_table_with_pk.b IS NOT NULL] +08)----SubqueryAlias: rhs +09)------Projection: multiple_ordered_table_with_pk.c, multiple_ordered_table_with_pk.b, sum(multiple_ordered_table_with_pk.d) AS sum1 +10)--------Aggregate: groupBy=[[multiple_ordered_table_with_pk.c, multiple_ordered_table_with_pk.b]], aggr=[[sum(CAST(multiple_ordered_table_with_pk.d AS Int64))]] +11)----------Filter: multiple_ordered_table_with_pk.b IS NOT NULL +12)------------TableScan: multiple_ordered_table_with_pk projection=[b, c, d], partial_filters=[multiple_ordered_table_with_pk.b IS NOT NULL] physical_plan 01)ProjectionExec: expr=[c@0 as c, c@2 as c, sum1@1 as sum1, sum1@3 as sum1] 02)--CoalesceBatchesExec: target_batch_size=2 03)----HashJoinExec: 
mode=CollectLeft, join_type=Inner, on=[(b@1, b@1)], projection=[c@0, sum1@2, c@3, sum1@5] 04)------ProjectionExec: expr=[c@0 as c, b@1 as b, sum(multiple_ordered_table_with_pk.d)@2 as sum1] 05)--------AggregateExec: mode=Single, gby=[c@1 as c, b@0 as b], aggr=[sum(multiple_ordered_table_with_pk.d)], ordering_mode=PartiallySorted([0]) -06)----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[b, c, d], output_ordering=[c@1 ASC NULLS LAST], has_header=true -07)------ProjectionExec: expr=[c@0 as c, b@1 as b, sum(multiple_ordered_table_with_pk.d)@2 as sum1] -08)--------AggregateExec: mode=Single, gby=[c@1 as c, b@0 as b], aggr=[sum(multiple_ordered_table_with_pk.d)], ordering_mode=PartiallySorted([0]) -09)----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[b, c, d], output_ordering=[c@1 ASC NULLS LAST], has_header=true +06)----------CoalesceBatchesExec: target_batch_size=2 +07)------------FilterExec: b@0 IS NOT NULL +08)--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[b, c, d], output_ordering=[c@1 ASC NULLS LAST], has_header=true +09)------ProjectionExec: expr=[c@0 as c, b@1 as b, sum(multiple_ordered_table_with_pk.d)@2 as sum1] +10)--------AggregateExec: mode=Single, gby=[c@1 as c, b@0 as b], aggr=[sum(multiple_ordered_table_with_pk.d)], ordering_mode=PartiallySorted([0]) +11)----------CoalesceBatchesExec: target_batch_size=2 +12)------------FilterExec: b@0 IS NOT NULL +13)--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[b, c, d], output_ordering=[c@1 ASC NULLS LAST], has_header=true query TT EXPLAIN SELECT lhs.c, rhs.c, lhs.sum1, rhs.sum1 From 29f112a8c95e5a140e57d987eb2cd8925a17bcc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 7 Sep 2024 18:06:57 +0200 Subject: [PATCH 14/22] Wip --- 
.../join_disable_repartition_joins.slt | 20 +++++++++++++------ datafusion/sqllogictest/test_files/joins.slt | 5 +++-- .../sqllogictest/test_files/subquery.slt | 13 +++++++----- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt b/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt index c56c59b1bd78..7f3878ae6863 100644 --- a/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt +++ b/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt @@ -50,16 +50,24 @@ logical_plan 02)--Projection: t2.a 03)----Inner Join: t1.c = t2.c 04)------SubqueryAlias: t1 -05)--------TableScan: annotated_data projection=[c] -06)------SubqueryAlias: t2 -07)--------TableScan: annotated_data projection=[a, c] +05)--------Filter: annotated_data.c IS NOT NULL +06)----------TableScan: annotated_data projection=[c], partial_filters=[annotated_data.c IS NOT NULL] +07)------SubqueryAlias: t2 +08)--------Filter: annotated_data.c IS NOT NULL +09)----------TableScan: annotated_data projection=[a, c], partial_filters=[annotated_data.c IS NOT NULL] physical_plan 01)SortPreservingMergeExec: [a@0 ASC NULLS LAST], fetch=5 02)--CoalesceBatchesExec: target_batch_size=8192, fetch=5 03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(c@0, c@1)], projection=[a@1] -04)------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c], has_header=true -05)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -06)--------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, c], output_ordering=[a@0 ASC NULLS LAST], has_header=true +04)------CoalescePartitionsExec +05)--------CoalesceBatchesExec: target_batch_size=8192 +06)----------FilterExec: c@0 IS NOT NULL +07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 
+08)--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c], has_header=true +09)------CoalesceBatchesExec: target_batch_size=8192 +10)--------FilterExec: c@1 IS NOT NULL +11)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +12)------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, c], output_ordering=[a@0 ASC NULLS LAST], has_header=true # preserve_inner_join query IIII nosort diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index 5ce67278d2ba..793bd5c3efed 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -1111,9 +1111,10 @@ WHERE join_t1.t1_int IS NOT NULL ---- logical_plan 01)Inner Join: join_t1.t1_id = join_t2.t2_id -02)--Filter: join_t1.t1_int IS NOT NULL +02)--Filter: join_t1.t1_id IS NOT NULL AND join_t1.t1_int IS NOT NULL 03)----TableScan: join_t1 projection=[t1_id, t1_name, t1_int] -04)--TableScan: join_t2 projection=[t2_id, t2_name, t2_int] +04)--Filter: join_t2.t2_id IS NOT NULL +05)----TableScan: join_t2 projection=[t2_id, t2_name, t2_int] # Reduce right join 2 (to inner join) diff --git a/datafusion/sqllogictest/test_files/subquery.slt b/datafusion/sqllogictest/test_files/subquery.slt index aa39e796cc15..35458050bab2 100644 --- a/datafusion/sqllogictest/test_files/subquery.slt +++ b/datafusion/sqllogictest/test_files/subquery.slt @@ -272,7 +272,8 @@ logical_plan 04)----SubqueryAlias: __scalar_sq_1 05)------Projection: sum(t2.t2_int), t2.t2_id 06)--------Aggregate: groupBy=[[t2.t2_id]], aggr=[[sum(CAST(t2.t2_int AS Int64))]] -07)----------TableScan: t2 projection=[t2_id, t2_int] +07)----------Filter: t2.t2_id IS NOT NULL +08)------------TableScan: t2 projection=[t2_id, t2_int] physical_plan 01)ProjectionExec: expr=[t1_id@1 as t1_id, sum(t2.t2_int)@0 as t2_sum] 
02)--CoalesceBatchesExec: target_batch_size=2 @@ -282,10 +283,12 @@ physical_plan 06)----------CoalesceBatchesExec: target_batch_size=2 07)------------RepartitionExec: partitioning=Hash([t2_id@0], 4), input_partitions=4 08)--------------AggregateExec: mode=Partial, gby=[t2_id@0 as t2_id], aggr=[sum(t2.t2_int)] -09)----------------MemoryExec: partitions=4, partition_sizes=[1, 0, 0, 0] -10)------CoalesceBatchesExec: target_batch_size=2 -11)--------RepartitionExec: partitioning=Hash([t1_id@0], 4), input_partitions=4 -12)----------MemoryExec: partitions=4, partition_sizes=[1, 0, 0, 0] +09)----------------CoalesceBatchesExec: target_batch_size=2 +10)------------------FilterExec: t2_id@0 IS NOT NULL +11)--------------------MemoryExec: partitions=4, partition_sizes=[1, 0, 0, 0] +12)------CoalesceBatchesExec: target_batch_size=2 +13)--------RepartitionExec: partitioning=Hash([t1_id@0], 4), input_partitions=4 +14)----------MemoryExec: partitions=4, partition_sizes=[1, 0, 0, 0] query II rowsort SELECT t1_id, (SELECT sum(t2_int) FROM t2 WHERE t2.t2_id = t1.t1_id group by t2_id, 'a') as t2_sum from t1 From 30db2c9b054f414b5321e6c53c4aec0bacca1138 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 7 Sep 2024 18:54:58 +0200 Subject: [PATCH 15/22] Wip --- .../join_disable_repartition_joins.slt | 30 ++++++++++--------- datafusion/sqllogictest/test_files/joins.slt | 6 ++-- .../sqllogictest/test_files/subquery.slt | 13 ++++---- 3 files changed, 28 insertions(+), 21 deletions(-) diff --git a/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt b/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt index 7f3878ae6863..577729c3bcc1 100644 --- a/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt +++ b/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt @@ -55,19 +55,20 @@ logical_plan 07)------SubqueryAlias: t2 08)--------Filter: annotated_data.c IS NOT NULL 09)----------TableScan: 
annotated_data projection=[a, c], partial_filters=[annotated_data.c IS NOT NULL] -physical_plan + 01)SortPreservingMergeExec: [a@0 ASC NULLS LAST], fetch=5 02)--CoalesceBatchesExec: target_batch_size=8192, fetch=5 03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(c@0, c@1)], projection=[a@1] 04)------CoalescePartitionsExec -05)--------CoalesceBatchesExec: target_batch_size=8192 -06)----------FilterExec: c@0 IS NOT NULL -07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -08)--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c], has_header=true -09)------CoalesceBatchesExec: target_batch_size=8192 -10)--------FilterExec: c@1 IS NOT NULL -11)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -12)------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, c], output_ordering=[a@0 ASC NULLS LAST], has_header=true +05)--------CoalescePartitionsExec +06)----------CoalesceBatchesExec: target_batch_size=8192 +07)------------FilterExec: d@1 IS NOT NULL AND c@0 IS NOT NULL +08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +09)----------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c, d], has_header=true +10)--------CoalesceBatchesExec: target_batch_size=8192 +11)----------FilterExec: d@3 IS NOT NULL AND c@2 IS NOT NULL AND d@3 = 3 +12)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +13)--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c, d], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], has_header=true # preserve_inner_join query IIII nosort @@ -97,11 +98,12 @@ logical_plan 02)--Projection: t2.a AS a2, t2.b 03)----RightSemi Join: t1.d = t2.d, 
t1.c = t2.c 04)------SubqueryAlias: t1 -05)--------TableScan: annotated_data projection=[c, d] -06)------SubqueryAlias: t2 -07)--------Filter: annotated_data.d = Int32(3) -08)----------TableScan: annotated_data projection=[a, b, c, d], partial_filters=[annotated_data.d = Int32(3)] -physical_plan +05)--------Filter: annotated_data.d IS NOT NULL AND annotated_data.c IS NOT NULL +06)----------TableScan: annotated_data projection=[c, d], partial_filters=[annotated_data.d IS NOT NULL, annotated_data.c IS NOT NULL] +07)------SubqueryAlias: t2 +08)--------Filter: annotated_data.d IS NOT NULL AND annotated_data.c IS NOT NULL AND annotated_data.d = Int32(3) +09)----------TableScan: annotated_data projection=[a, b, c, d], partial_filters=[annotated_data.d = Int32(3), annotated_data.d IS NOT NULL, annotated_data.c IS NOT NULL] + 01)SortPreservingMergeExec: [a2@0 ASC NULLS LAST,b@1 ASC NULLS LAST], fetch=10 02)--ProjectionExec: expr=[a@0 as a2, b@1 as b] 03)----CoalesceBatchesExec: target_batch_size=8192, fetch=10 diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index 793bd5c3efed..23d99b5305f6 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -1127,8 +1127,10 @@ WHERE NOT (join_t1.t1_int = join_t2.t2_int) ---- logical_plan 01)Inner Join: join_t1.t1_id = join_t2.t2_id Filter: join_t2.t2_int != join_t1.t1_int -02)--TableScan: join_t1 projection=[t1_id, t1_name, t1_int] -03)--TableScan: join_t2 projection=[t2_id, t2_name, t2_int] +02)--Filter: join_t1.t1_id IS NOT NULL +03)----TableScan: join_t1 projection=[t1_id, t1_name, t1_int] +04)--Filter: join_t2.t2_id IS NOT NULL +05)----TableScan: join_t2 projection=[t2_id, t2_name, t2_int] # Reduce full join to right join diff --git a/datafusion/sqllogictest/test_files/subquery.slt b/datafusion/sqllogictest/test_files/subquery.slt index 35458050bab2..493db62286b6 100644 --- 
a/datafusion/sqllogictest/test_files/subquery.slt +++ b/datafusion/sqllogictest/test_files/subquery.slt @@ -310,7 +310,8 @@ logical_plan 05)------Projection: sum(t2.t2_int), t2.t2_id 06)--------Filter: sum(t2.t2_int) < Int64(3) 07)----------Aggregate: groupBy=[[t2.t2_id]], aggr=[[sum(CAST(t2.t2_int AS Int64))]] -08)------------TableScan: t2 projection=[t2_id, t2_int] +08)------------Filter: t2.t2_id IS NOT NULL +09)--------------TableScan: t2 projection=[t2_id, t2_int] physical_plan 01)ProjectionExec: expr=[t1_id@1 as t1_id, sum(t2.t2_int)@0 as t2_sum] 02)--CoalesceBatchesExec: target_batch_size=2 @@ -322,10 +323,12 @@ physical_plan 08)--------------CoalesceBatchesExec: target_batch_size=2 09)----------------RepartitionExec: partitioning=Hash([t2_id@0], 4), input_partitions=4 10)------------------AggregateExec: mode=Partial, gby=[t2_id@0 as t2_id], aggr=[sum(t2.t2_int)] -11)--------------------MemoryExec: partitions=4, partition_sizes=[1, 0, 0, 0] -12)------CoalesceBatchesExec: target_batch_size=2 -13)--------RepartitionExec: partitioning=Hash([t1_id@0], 4), input_partitions=4 -14)----------MemoryExec: partitions=4, partition_sizes=[1, 0, 0, 0] +11)--------------------CoalesceBatchesExec: target_batch_size=2 +12)----------------------FilterExec: t2_id@0 IS NOT NULL +13)------------------------MemoryExec: partitions=4, partition_sizes=[1, 0, 0, 0] +14)------CoalesceBatchesExec: target_batch_size=2 +15)--------RepartitionExec: partitioning=Hash([t1_id@0], 4), input_partitions=4 +16)----------MemoryExec: partitions=4, partition_sizes=[1, 0, 0, 0] query II rowsort SELECT t1_id, (SELECT sum(t2_int) FROM t2 WHERE t2.t2_id = t1.t1_id having sum(t2_int) < 3) as t2_sum from t1 From b2c74124c4fc8fbcfe11a3efc0c311ef58837263 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 7 Sep 2024 18:57:10 +0200 Subject: [PATCH 16/22] Wip --- .../sqllogictest/test_files/tpch/q10.slt.part | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git 
a/datafusion/sqllogictest/test_files/tpch/q10.slt.part b/datafusion/sqllogictest/test_files/tpch/q10.slt.part index aed5e055e4db..873b635975b4 100644 --- a/datafusion/sqllogictest/test_files/tpch/q10.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/q10.slt.part @@ -69,6 +69,25 @@ logical_plan 16)------------------TableScan: lineitem projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], partial_filters=[lineitem.l_returnflag = Utf8("R")] 17)----------TableScan: nation projection=[n_nationkey, n_name] physical_plan +01)SortPreservingMergeExec: [revenue@2 DESC], fetch=10 +02)--SortExec: TopK(fetch=10), expr=[revenue@2 DESC], preserve_partitioning=[true] +03)----ProjectionExec: expr=[c_custkey@0 as c_custkey, c_name@1 as c_name, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@7 as revenue, c_acctbal@2 as c_acctbal, n_name@4 as n_name, c_address@5 as c_address, c_phone@3 as c_phone, c_comment@6 as c_comment] +04)------AggregateExec: mode=FinalPartitioned, gby=[c_custkey@0 as c_custkey, c_name@1 as c_name, c_acctbal@2 as c_acctbal, c_phone@3 as c_phone, n_name@4 as n_name, c_address@5 as c_address, c_comment@6 as c_comment], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] +05)--------CoalesceBatchesExec: target_batch_size=8192 +06)----------RepartitionExec: partitioning=Hash([c_custkey@0, c_name@1, c_acctbal@2, c_phone@3, n_name@4, c_address@5, c_comment@6], 4), input_partitions=4 +07)------------AggregateExec: mode=Partial, gby=[c_custkey@0 as c_custkey, c_name@1 as c_name, c_acctbal@4 as c_acctbal, c_phone@3 as c_phone, n_name@8 as n_name, c_address@2 as c_address, c_comment@5 as c_comment], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] +08)--------------CoalesceBatchesExec: target_batch_size=8192 +09)----------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_nationkey@3, n_nationkey@0)], projection=[c_custkey@0, c_name@1, c_address@2, c_phone@4, c_acctbal@5, c_comment@6, 
l_extendedprice@7, l_discount@8, n_name@10] +10)------------------CoalesceBatchesExec: target_batch_size=8192 +11)--------------------RepartitionExec: partitioning=Hash([c_nationkey@3], 4), input_partitions=4 +12)----------------------CoalesceBatchesExec: target_batch_size=8192 +13)------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@7, l_orderkey@0)], projection=[c_custkey@0, c_name@1, c_address@2, c_nationkey@3, c_phone@4, c_acctbal@5, c_comment@6, l_extendedprice@9, l_discount@10] +14)--------------------------CoalesceBatchesExec: target_batch_size=8192 +15)----------------------------RepartitionExec: partitioning=Hash([o_orderkey@7], 4), input_partitions=4 +16)------------------------------CoalesceBatchesExec: target_batch_size=8192 +17)--------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_custkey@0, c_name@1, c_address@2, c_nationkey@3, c_phone@4, c_acctbal@5, c_comment@6, o_orderkey@7] +18)----------------------------------CoalesceBatchesExec: target_batch_size=8192 +19)------------------------------------RepartitionExec: partitioning=Hash([c_custkey@0], 4), input_partitions=4 20)--------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 21)----------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_comment], has_header=false 22)----------------------------------CoalesceBatchesExec: target_batch_size=8192 From 001ad1a51cb40e7f83aff7861fbb8c1c4322254e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 7 Sep 2024 18:57:33 +0200 Subject: [PATCH 17/22] Wip --- datafusion/sqllogictest/test_files/tpch/q10.slt.part | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datafusion/sqllogictest/test_files/tpch/q10.slt.part 
b/datafusion/sqllogictest/test_files/tpch/q10.slt.part index 873b635975b4..73593a470c9a 100644 --- a/datafusion/sqllogictest/test_files/tpch/q10.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/q10.slt.part @@ -105,6 +105,8 @@ physical_plan 34)----------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 35)------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], has_header=false + + query ITRRTTTT select c_custkey, From 835f94c2aa6359820a4313a151bcf27812a9afba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 7 Sep 2024 20:53:54 +0200 Subject: [PATCH 18/22] Wip --- .../join_disable_repartition_joins.slt | 35 +- datafusion/sqllogictest/test_files/joins.slt | 850 +++++++++++------- .../sqllogictest/test_files/subquery.slt | 150 ++-- .../test_files/tpch/create_tables.slt.part | 2 +- 4 files changed, 642 insertions(+), 395 deletions(-) diff --git a/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt b/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt index 577729c3bcc1..ba68faf5966e 100644 --- a/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt +++ b/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt @@ -55,20 +55,19 @@ logical_plan 07)------SubqueryAlias: t2 08)--------Filter: annotated_data.c IS NOT NULL 09)----------TableScan: annotated_data projection=[a, c], partial_filters=[annotated_data.c IS NOT NULL] - +physical_plan 01)SortPreservingMergeExec: [a@0 ASC NULLS LAST], fetch=5 02)--CoalesceBatchesExec: target_batch_size=8192, fetch=5 03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(c@0, c@1)], projection=[a@1] 04)------CoalescePartitionsExec -05)--------CoalescePartitionsExec -06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: d@1 IS NOT NULL AND c@0 IS NOT NULL 
-08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -09)----------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c, d], has_header=true -10)--------CoalesceBatchesExec: target_batch_size=8192 -11)----------FilterExec: d@3 IS NOT NULL AND c@2 IS NOT NULL AND d@3 = 3 -12)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -13)--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c, d], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], has_header=true +05)--------CoalesceBatchesExec: target_batch_size=8192 +06)----------FilterExec: c@0 IS NOT NULL +07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +08)--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c], has_header=true +09)------CoalesceBatchesExec: target_batch_size=8192 +10)--------FilterExec: c@1 IS NOT NULL +11)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +12)------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, c], output_ordering=[a@0 ASC NULLS LAST], has_header=true # preserve_inner_join query IIII nosort @@ -103,16 +102,20 @@ logical_plan 07)------SubqueryAlias: t2 08)--------Filter: annotated_data.d IS NOT NULL AND annotated_data.c IS NOT NULL AND annotated_data.d = Int32(3) 09)----------TableScan: annotated_data projection=[a, b, c, d], partial_filters=[annotated_data.d = Int32(3), annotated_data.d IS NOT NULL, annotated_data.c IS NOT NULL] - +physical_plan 01)SortPreservingMergeExec: [a2@0 ASC NULLS LAST,b@1 ASC NULLS LAST], fetch=10 02)--ProjectionExec: expr=[a@0 as a2, b@1 as b] 03)----CoalesceBatchesExec: target_batch_size=8192, fetch=10 04)------HashJoinExec: mode=CollectLeft, 
join_type=RightSemi, on=[(d@1, d@3), (c@0, c@2)], projection=[a@0, b@1] -05)--------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c, d], has_header=true -06)--------CoalesceBatchesExec: target_batch_size=8192 -07)----------FilterExec: d@3 = 3 -08)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -09)--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c, d], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], has_header=true +05)--------CoalescePartitionsExec +06)----------CoalesceBatchesExec: target_batch_size=8192 +07)------------FilterExec: d@1 IS NOT NULL AND c@0 IS NOT NULL +08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +09)----------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c, d], has_header=true +10)--------CoalesceBatchesExec: target_batch_size=8192 +11)----------FilterExec: d@3 IS NOT NULL AND c@2 IS NOT NULL AND d@3 = 3 +12)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +13)--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c, d], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], has_header=true # preserve_right_semi_join query II nosort diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index 23d99b5305f6..15896a3dc319 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -1143,9 +1143,10 @@ WHERE join_t2.t2_name IS NOT NULL ---- logical_plan 01)Right Join: join_t1.t1_id = join_t2.t2_id -02)--TableScan: join_t1 projection=[t1_id, t1_name, t1_int] -03)--Filter: join_t2.t2_name IS NOT NULL -04)----TableScan: join_t2 
projection=[t2_id, t2_name, t2_int] +02)--Filter: join_t1.t1_id IS NOT NULL +03)----TableScan: join_t1 projection=[t1_id, t1_name, t1_int] +04)--Filter: join_t2.t2_name IS NOT NULL +05)----TableScan: join_t2 projection=[t2_id, t2_name, t2_int] # Reduce full join to left join @@ -1160,7 +1161,8 @@ logical_plan 01)Left Join: join_t1.t1_id = join_t2.t2_id 02)--Filter: join_t1.t1_name != Utf8("b") 03)----TableScan: join_t1 projection=[t1_id, t1_name, t1_int] -04)--TableScan: join_t2 projection=[t2_id, t2_name, t2_int] +04)--Filter: join_t2.t2_id IS NOT NULL +05)----TableScan: join_t2 projection=[t2_id, t2_name, t2_int] # Reduce full join to inner join @@ -1173,9 +1175,9 @@ WHERE join_t1.t1_name != 'b' and join_t2.t2_name = 'x' ---- logical_plan 01)Inner Join: join_t1.t1_id = join_t2.t2_id -02)--Filter: join_t1.t1_name != Utf8("b") +02)--Filter: join_t1.t1_id IS NOT NULL AND join_t1.t1_name != Utf8("b") 03)----TableScan: join_t1 projection=[t1_id, t1_name, t1_int] -04)--Filter: join_t2.t2_name = Utf8("x") +04)--Filter: join_t2.t2_id IS NOT NULL AND join_t2.t2_name = Utf8("x") 05)----TableScan: join_t2 projection=[t2_id, t2_name, t2_int] ### @@ -1225,10 +1227,11 @@ LEFT SEMI JOIN lsaj_t2 ON (lsaj_t1.t1_id = lsaj_t2.t2_id and lsaj_t2.t2_int > 1) ---- logical_plan 01)LeftSemi Join: lsaj_t1.t1_id = lsaj_t2.t2_id -02)--TableScan: lsaj_t1 projection=[t1_id, t1_name] -03)--Projection: lsaj_t2.t2_id -04)----Filter: lsaj_t2.t2_int > UInt32(1) -05)------TableScan: lsaj_t2 projection=[t2_id, t2_int] +02)--Filter: lsaj_t1.t1_id IS NOT NULL +03)----TableScan: lsaj_t1 projection=[t1_id, t1_name] +04)--Projection: lsaj_t2.t2_id +05)----Filter: lsaj_t2.t2_id IS NOT NULL AND lsaj_t2.t2_int > UInt32(1) +06)------TableScan: lsaj_t2 projection=[t2_id, t2_int] # Left anti join @@ -1340,20 +1343,26 @@ logical_plan 01)Aggregate: groupBy=[[join_t1.t1_id]], aggr=[[]] 02)--Projection: join_t1.t1_id 03)----Inner Join: join_t1.t1_id = join_t2.t2_id -04)------TableScan: join_t1 projection=[t1_id] 
-05)------TableScan: join_t2 projection=[t2_id] +04)------Filter: join_t1.t1_id IS NOT NULL +05)--------TableScan: join_t1 projection=[t1_id] +06)------Filter: join_t2.t2_id IS NOT NULL +07)--------TableScan: join_t2 projection=[t2_id] physical_plan 01)AggregateExec: mode=SinglePartitioned, gby=[t1_id@0 as t1_id], aggr=[] 02)--CoalesceBatchesExec: target_batch_size=2 03)----HashJoinExec: mode=Partitioned, join_type=Inner, on=[(t1_id@0, t2_id@0)], projection=[t1_id@0] 04)------CoalesceBatchesExec: target_batch_size=2 05)--------RepartitionExec: partitioning=Hash([t1_id@0], 2), input_partitions=2 -06)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -07)------------MemoryExec: partitions=1, partition_sizes=[1] -08)------CoalesceBatchesExec: target_batch_size=2 -09)--------RepartitionExec: partitioning=Hash([t2_id@0], 2), input_partitions=2 -10)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -11)------------MemoryExec: partitions=1, partition_sizes=[1] +06)----------CoalesceBatchesExec: target_batch_size=2 +07)------------FilterExec: t1_id@0 IS NOT NULL +08)--------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +09)----------------MemoryExec: partitions=1, partition_sizes=[1] +10)------CoalesceBatchesExec: target_batch_size=2 +11)--------RepartitionExec: partitioning=Hash([t2_id@0], 2), input_partitions=2 +12)----------CoalesceBatchesExec: target_batch_size=2 +13)------------FilterExec: t2_id@0 IS NOT NULL +14)--------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +15)----------------MemoryExec: partitions=1, partition_sizes=[1] # Join on struct query TT @@ -1363,19 +1372,25 @@ inner join join_t4 on join_t3.s3 = join_t4.s4 ---- logical_plan 01)Inner Join: join_t3.s3 = join_t4.s4 -02)--TableScan: join_t3 projection=[s3] -03)--TableScan: join_t4 projection=[s4] +02)--Filter: join_t3.s3 IS NOT NULL +03)----TableScan: join_t3 projection=[s3] +04)--Filter: 
join_t4.s4 IS NOT NULL +05)----TableScan: join_t4 projection=[s4] physical_plan 01)CoalesceBatchesExec: target_batch_size=2 02)--HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s3@0, s4@0)] 03)----CoalesceBatchesExec: target_batch_size=2 04)------RepartitionExec: partitioning=Hash([s3@0], 2), input_partitions=2 -05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -06)----------MemoryExec: partitions=1, partition_sizes=[1] -07)----CoalesceBatchesExec: target_batch_size=2 -08)------RepartitionExec: partitioning=Hash([s4@0], 2), input_partitions=2 -09)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -10)----------MemoryExec: partitions=1, partition_sizes=[1] +05)--------CoalesceBatchesExec: target_batch_size=2 +06)----------FilterExec: s3@0 IS NOT NULL +07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +08)--------------MemoryExec: partitions=1, partition_sizes=[1] +09)----CoalesceBatchesExec: target_batch_size=2 +10)------RepartitionExec: partitioning=Hash([s4@0], 2), input_partitions=2 +11)--------CoalesceBatchesExec: target_batch_size=2 +12)----------FilterExec: s4@0 IS NOT NULL +13)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +14)--------------MemoryExec: partitions=1, partition_sizes=[1] query ?? 
select join_t3.s3, join_t4.s4 @@ -1404,8 +1419,10 @@ logical_plan 02)--Aggregate: groupBy=[[join_t1.t1_id]], aggr=[[count(Int64(1)) AS count(*)]] 03)----Projection: join_t1.t1_id 04)------Inner Join: join_t1.t1_id = join_t2.t2_id -05)--------TableScan: join_t1 projection=[t1_id] -06)--------TableScan: join_t2 projection=[t2_id] +05)--------Filter: join_t1.t1_id IS NOT NULL +06)----------TableScan: join_t1 projection=[t1_id] +07)--------Filter: join_t2.t2_id IS NOT NULL +08)----------TableScan: join_t2 projection=[t2_id] physical_plan 01)ProjectionExec: expr=[count(*)@1 as count(*)] 02)--AggregateExec: mode=SinglePartitioned, gby=[t1_id@0 as t1_id], aggr=[count(*)] @@ -1413,12 +1430,16 @@ physical_plan 04)------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(t1_id@0, t2_id@0)], projection=[t1_id@0] 05)--------CoalesceBatchesExec: target_batch_size=2 06)----------RepartitionExec: partitioning=Hash([t1_id@0], 2), input_partitions=2 -07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -08)--------------MemoryExec: partitions=1, partition_sizes=[1] -09)--------CoalesceBatchesExec: target_batch_size=2 -10)----------RepartitionExec: partitioning=Hash([t2_id@0], 2), input_partitions=2 -11)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -12)--------------MemoryExec: partitions=1, partition_sizes=[1] +07)------------CoalesceBatchesExec: target_batch_size=2 +08)--------------FilterExec: t1_id@0 IS NOT NULL +09)----------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +10)------------------MemoryExec: partitions=1, partition_sizes=[1] +11)--------CoalesceBatchesExec: target_batch_size=2 +12)----------RepartitionExec: partitioning=Hash([t2_id@0], 2), input_partitions=2 +13)------------CoalesceBatchesExec: target_batch_size=2 +14)--------------FilterExec: t2_id@0 IS NOT NULL +15)----------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 
+16)------------------MemoryExec: partitions=1, partition_sizes=[1] query TT EXPLAIN @@ -1432,8 +1453,10 @@ logical_plan 03)----Aggregate: groupBy=[[join_t1.t1_id AS alias1]], aggr=[[]] 04)------Projection: join_t1.t1_id 05)--------Inner Join: join_t1.t1_id = join_t2.t2_id -06)----------TableScan: join_t1 projection=[t1_id] -07)----------TableScan: join_t2 projection=[t2_id] +06)----------Filter: join_t1.t1_id IS NOT NULL +07)------------TableScan: join_t1 projection=[t1_id] +08)----------Filter: join_t2.t2_id IS NOT NULL +09)------------TableScan: join_t2 projection=[t2_id] physical_plan 01)ProjectionExec: expr=[count(alias1)@0 as count(DISTINCT join_t1.t1_id)] 02)--AggregateExec: mode=Final, gby=[], aggr=[count(alias1)] @@ -1444,12 +1467,16 @@ physical_plan 07)------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(t1_id@0, t2_id@0)], projection=[t1_id@0] 08)--------------CoalesceBatchesExec: target_batch_size=2 09)----------------RepartitionExec: partitioning=Hash([t1_id@0], 2), input_partitions=2 -10)------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -11)--------------------MemoryExec: partitions=1, partition_sizes=[1] -12)--------------CoalesceBatchesExec: target_batch_size=2 -13)----------------RepartitionExec: partitioning=Hash([t2_id@0], 2), input_partitions=2 -14)------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -15)--------------------MemoryExec: partitions=1, partition_sizes=[1] +10)------------------CoalesceBatchesExec: target_batch_size=2 +11)--------------------FilterExec: t1_id@0 IS NOT NULL +12)----------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +13)------------------------MemoryExec: partitions=1, partition_sizes=[1] +14)--------------CoalesceBatchesExec: target_batch_size=2 +15)----------------RepartitionExec: partitioning=Hash([t2_id@0], 2), input_partitions=2 +16)------------------CoalesceBatchesExec: target_batch_size=2 
+17)--------------------FilterExec: t2_id@0 IS NOT NULL +18)----------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +19)------------------------MemoryExec: partitions=1, partition_sizes=[1] statement ok set datafusion.explain.logical_plan_only = true; @@ -1465,8 +1492,10 @@ where join_t1.t1_id + 12 = join_t2.t2_id + 1 ---- logical_plan 01)Inner Join: CAST(join_t1.t1_id AS Int64) + Int64(12) = CAST(join_t2.t2_id AS Int64) + Int64(1) -02)--TableScan: join_t1 projection=[t1_id, t1_name, t1_int] -03)--TableScan: join_t2 projection=[t2_id, t2_name, t2_int] +02)--Filter: CAST(join_t1.t1_id AS Int64) + Int64(12) IS NOT NULL +03)----TableScan: join_t1 projection=[t1_id, t1_name, t1_int] +04)--Filter: CAST(join_t2.t2_id AS Int64) + Int64(1) IS NOT NULL +05)----TableScan: join_t2 projection=[t2_id, t2_name, t2_int] # Reduce cross join with cast expr join key (to inner join) @@ -1479,8 +1508,10 @@ cross join join_t2 where join_t1.t1_id + 11 = cast(join_t2.t2_id as BIGINT) logical_plan 01)Projection: join_t1.t1_id, join_t2.t2_id, join_t1.t1_name 02)--Inner Join: CAST(join_t1.t1_id AS Int64) + Int64(11) = CAST(join_t2.t2_id AS Int64) -03)----TableScan: join_t1 projection=[t1_id, t1_name] -04)----TableScan: join_t2 projection=[t2_id] +03)----Filter: CAST(join_t1.t1_id AS Int64) + Int64(11) IS NOT NULL +04)------TableScan: join_t1 projection=[t1_id, t1_name] +05)----Filter: CAST(join_t2.t2_id AS Int64) IS NOT NULL +06)------TableScan: join_t2 projection=[t2_id] ##### @@ -1504,19 +1535,25 @@ where join_t1.t1_id + 11 = join_t2.t2_id logical_plan 01)Projection: join_t1.t1_id, join_t1.t1_name, join_t1.t1_int, join_t2.t2_id, join_t2.t2_name, join_t2.t2_int, CAST(join_t1.t1_id AS Int64) + Int64(11) 02)--Inner Join: CAST(join_t1.t1_id AS Int64) + Int64(11) = CAST(join_t2.t2_id AS Int64) -03)----TableScan: join_t1 projection=[t1_id, t1_name, t1_int] -04)----TableScan: join_t2 projection=[t2_id, t2_name, t2_int] +03)----Filter: CAST(join_t1.t1_id AS 
Int64) + Int64(11) IS NOT NULL +04)------TableScan: join_t1 projection=[t1_id, t1_name, t1_int] +05)----Filter: CAST(join_t2.t2_id AS Int64) IS NOT NULL +06)------TableScan: join_t2 projection=[t2_id, t2_name, t2_int] physical_plan 01)ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int, t2_id@3 as t2_id, t2_name@4 as t2_name, t2_int@5 as t2_int, CAST(t1_id@0 AS Int64) + 11 as join_t1.t1_id + Int64(11)] 02)--CoalesceBatchesExec: target_batch_size=2 03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(join_t1.t1_id + Int64(11)@3, CAST(join_t2.t2_id AS Int64)@3)], projection=[t1_id@0, t1_name@1, t1_int@2, t2_id@4, t2_name@5, t2_int@6] 04)------CoalescePartitionsExec 05)--------ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int, CAST(t1_id@0 AS Int64) + 11 as join_t1.t1_id + Int64(11)] -06)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -07)------------MemoryExec: partitions=1, partition_sizes=[1] -08)------ProjectionExec: expr=[t2_id@0 as t2_id, t2_name@1 as t2_name, t2_int@2 as t2_int, CAST(t2_id@0 AS Int64) as CAST(join_t2.t2_id AS Int64)] -09)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -10)----------MemoryExec: partitions=1, partition_sizes=[1] +06)----------CoalesceBatchesExec: target_batch_size=2 +07)------------FilterExec: CAST(t1_id@0 AS Int64) + 11 IS NOT NULL +08)--------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +09)----------------MemoryExec: partitions=1, partition_sizes=[1] +10)------ProjectionExec: expr=[t2_id@0 as t2_id, t2_name@1 as t2_name, t2_int@2 as t2_int, CAST(t2_id@0 AS Int64) as CAST(join_t2.t2_id AS Int64)] +11)--------CoalesceBatchesExec: target_batch_size=2 +12)----------FilterExec: CAST(t2_id@0 AS Int64) IS NOT NULL +13)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +14)--------------MemoryExec: partitions=1, partition_sizes=[1] statement ok set 
datafusion.optimizer.repartition_joins = true; @@ -1530,8 +1567,10 @@ where join_t1.t1_id + 11 = join_t2.t2_id logical_plan 01)Projection: join_t1.t1_id, join_t1.t1_name, join_t1.t1_int, join_t2.t2_id, join_t2.t2_name, join_t2.t2_int, CAST(join_t1.t1_id AS Int64) + Int64(11) 02)--Inner Join: CAST(join_t1.t1_id AS Int64) + Int64(11) = CAST(join_t2.t2_id AS Int64) -03)----TableScan: join_t1 projection=[t1_id, t1_name, t1_int] -04)----TableScan: join_t2 projection=[t2_id, t2_name, t2_int] +03)----Filter: CAST(join_t1.t1_id AS Int64) + Int64(11) IS NOT NULL +04)------TableScan: join_t1 projection=[t1_id, t1_name, t1_int] +05)----Filter: CAST(join_t2.t2_id AS Int64) IS NOT NULL +06)------TableScan: join_t2 projection=[t2_id, t2_name, t2_int] physical_plan 01)ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int, t2_id@3 as t2_id, t2_name@4 as t2_name, t2_int@5 as t2_int, CAST(t1_id@0 AS Int64) + 11 as join_t1.t1_id + Int64(11)] 02)--CoalesceBatchesExec: target_batch_size=2 @@ -1539,13 +1578,17 @@ physical_plan 04)------CoalesceBatchesExec: target_batch_size=2 05)--------RepartitionExec: partitioning=Hash([join_t1.t1_id + Int64(11)@3], 2), input_partitions=2 06)----------ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_int@2 as t1_int, CAST(t1_id@0 AS Int64) + 11 as join_t1.t1_id + Int64(11)] -07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -08)--------------MemoryExec: partitions=1, partition_sizes=[1] -09)------CoalesceBatchesExec: target_batch_size=2 -10)--------RepartitionExec: partitioning=Hash([CAST(join_t2.t2_id AS Int64)@3], 2), input_partitions=2 -11)----------ProjectionExec: expr=[t2_id@0 as t2_id, t2_name@1 as t2_name, t2_int@2 as t2_int, CAST(t2_id@0 AS Int64) as CAST(join_t2.t2_id AS Int64)] -12)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -13)--------------MemoryExec: partitions=1, partition_sizes=[1] +07)------------CoalesceBatchesExec: 
target_batch_size=2 +08)--------------FilterExec: CAST(t1_id@0 AS Int64) + 11 IS NOT NULL +09)----------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +10)------------------MemoryExec: partitions=1, partition_sizes=[1] +11)------CoalesceBatchesExec: target_batch_size=2 +12)--------RepartitionExec: partitioning=Hash([CAST(join_t2.t2_id AS Int64)@3], 2), input_partitions=2 +13)----------ProjectionExec: expr=[t2_id@0 as t2_id, t2_name@1 as t2_name, t2_int@2 as t2_int, CAST(t2_id@0 AS Int64) as CAST(join_t2.t2_id AS Int64)] +14)------------CoalesceBatchesExec: target_batch_size=2 +15)--------------FilterExec: CAST(t2_id@0 AS Int64) IS NOT NULL +16)----------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +17)------------------MemoryExec: partitions=1, partition_sizes=[1] # Both side expr key inner join @@ -1561,19 +1604,25 @@ INNER JOIN join_t2 ON join_t1.t1_id + cast(12 as INT UNSIGNED) = join_t2.t2_id + logical_plan 01)Projection: join_t1.t1_id, join_t2.t2_id, join_t1.t1_name 02)--Inner Join: join_t1.t1_id + UInt32(12) = join_t2.t2_id + UInt32(1) -03)----TableScan: join_t1 projection=[t1_id, t1_name] -04)----TableScan: join_t2 projection=[t2_id] +03)----Filter: join_t1.t1_id + UInt32(12) IS NOT NULL +04)------TableScan: join_t1 projection=[t1_id, t1_name] +05)----Filter: join_t2.t2_id + UInt32(1) IS NOT NULL +06)------TableScan: join_t2 projection=[t2_id] physical_plan 01)ProjectionExec: expr=[t1_id@1 as t1_id, t2_id@0 as t2_id, t1_name@2 as t1_name] 02)--CoalesceBatchesExec: target_batch_size=2 03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(join_t2.t2_id + UInt32(1)@1, join_t1.t1_id + UInt32(12)@2)], projection=[t2_id@0, t1_id@2, t1_name@3] 04)------CoalescePartitionsExec 05)--------ProjectionExec: expr=[t2_id@0 as t2_id, t2_id@0 + 1 as join_t2.t2_id + UInt32(1)] -06)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -07)------------MemoryExec: partitions=1, 
partition_sizes=[1] -08)------ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_id@0 + 12 as join_t1.t1_id + UInt32(12)] -09)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -10)----------MemoryExec: partitions=1, partition_sizes=[1] +06)----------CoalesceBatchesExec: target_batch_size=2 +07)------------FilterExec: t2_id@0 + 1 IS NOT NULL +08)--------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +09)----------------MemoryExec: partitions=1, partition_sizes=[1] +10)------ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_id@0 + 12 as join_t1.t1_id + UInt32(12)] +11)--------CoalesceBatchesExec: target_batch_size=2 +12)----------FilterExec: t1_id@0 + 12 IS NOT NULL +13)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +14)--------------MemoryExec: partitions=1, partition_sizes=[1] statement ok set datafusion.optimizer.repartition_joins = true; @@ -1587,8 +1636,10 @@ INNER JOIN join_t2 ON join_t1.t1_id + cast(12 as INT UNSIGNED) = join_t2.t2_id + logical_plan 01)Projection: join_t1.t1_id, join_t2.t2_id, join_t1.t1_name 02)--Inner Join: join_t1.t1_id + UInt32(12) = join_t2.t2_id + UInt32(1) -03)----TableScan: join_t1 projection=[t1_id, t1_name] -04)----TableScan: join_t2 projection=[t2_id] +03)----Filter: join_t1.t1_id + UInt32(12) IS NOT NULL +04)------TableScan: join_t1 projection=[t1_id, t1_name] +05)----Filter: join_t2.t2_id + UInt32(1) IS NOT NULL +06)------TableScan: join_t2 projection=[t2_id] physical_plan 01)ProjectionExec: expr=[t1_id@1 as t1_id, t2_id@0 as t2_id, t1_name@2 as t1_name] 02)--CoalesceBatchesExec: target_batch_size=2 @@ -1596,13 +1647,17 @@ physical_plan 04)------CoalesceBatchesExec: target_batch_size=2 05)--------RepartitionExec: partitioning=Hash([join_t2.t2_id + UInt32(1)@1], 2), input_partitions=2 06)----------ProjectionExec: expr=[t2_id@0 as t2_id, t2_id@0 + 1 as join_t2.t2_id + UInt32(1)] -07)------------RepartitionExec: 
partitioning=RoundRobinBatch(2), input_partitions=1 -08)--------------MemoryExec: partitions=1, partition_sizes=[1] -09)------CoalesceBatchesExec: target_batch_size=2 -10)--------RepartitionExec: partitioning=Hash([join_t1.t1_id + UInt32(12)@2], 2), input_partitions=2 -11)----------ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_id@0 + 12 as join_t1.t1_id + UInt32(12)] -12)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -13)--------------MemoryExec: partitions=1, partition_sizes=[1] +07)------------CoalesceBatchesExec: target_batch_size=2 +08)--------------FilterExec: t2_id@0 + 1 IS NOT NULL +09)----------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +10)------------------MemoryExec: partitions=1, partition_sizes=[1] +11)------CoalesceBatchesExec: target_batch_size=2 +12)--------RepartitionExec: partitioning=Hash([join_t1.t1_id + UInt32(12)@2], 2), input_partitions=2 +13)----------ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_id@0 + 12 as join_t1.t1_id + UInt32(12)] +14)------------CoalesceBatchesExec: target_batch_size=2 +15)--------------FilterExec: t1_id@0 + 12 IS NOT NULL +16)----------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +17)------------------MemoryExec: partitions=1, partition_sizes=[1] # Left side expr key inner join @@ -1619,16 +1674,24 @@ ON join_t1.t1_id + cast(11 as INT UNSIGNED) = join_t2.t2_id logical_plan 01)Projection: join_t1.t1_id, join_t2.t2_id, join_t1.t1_name 02)--Inner Join: join_t1.t1_id + UInt32(11) = join_t2.t2_id -03)----TableScan: join_t1 projection=[t1_id, t1_name] -04)----TableScan: join_t2 projection=[t2_id] +03)----Filter: join_t1.t1_id + UInt32(11) IS NOT NULL +04)------TableScan: join_t1 projection=[t1_id, t1_name] +05)----Filter: join_t2.t2_id IS NOT NULL +06)------TableScan: join_t2 projection=[t2_id] physical_plan 01)ProjectionExec: expr=[t1_id@1 as t1_id, t2_id@0 as t2_id, t1_name@2 as t1_name] 
02)--CoalesceBatchesExec: target_batch_size=2 03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t2_id@0, join_t1.t1_id + UInt32(11)@2)], projection=[t2_id@0, t1_id@1, t1_name@2] -04)------MemoryExec: partitions=1, partition_sizes=[1] -05)------ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_id@0 + 11 as join_t1.t1_id + UInt32(11)] -06)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -07)----------MemoryExec: partitions=1, partition_sizes=[1] +04)------CoalescePartitionsExec +05)--------CoalesceBatchesExec: target_batch_size=2 +06)----------FilterExec: t2_id@0 IS NOT NULL +07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +08)--------------MemoryExec: partitions=1, partition_sizes=[1] +09)------ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_id@0 + 11 as join_t1.t1_id + UInt32(11)] +10)--------CoalesceBatchesExec: target_batch_size=2 +11)----------FilterExec: t1_id@0 + 11 IS NOT NULL +12)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +13)--------------MemoryExec: partitions=1, partition_sizes=[1] statement ok set datafusion.optimizer.repartition_joins = true; @@ -1643,21 +1706,27 @@ ON join_t1.t1_id + cast(11 as INT UNSIGNED) = join_t2.t2_id logical_plan 01)Projection: join_t1.t1_id, join_t2.t2_id, join_t1.t1_name 02)--Inner Join: join_t1.t1_id + UInt32(11) = join_t2.t2_id -03)----TableScan: join_t1 projection=[t1_id, t1_name] -04)----TableScan: join_t2 projection=[t2_id] +03)----Filter: join_t1.t1_id + UInt32(11) IS NOT NULL +04)------TableScan: join_t1 projection=[t1_id, t1_name] +05)----Filter: join_t2.t2_id IS NOT NULL +06)------TableScan: join_t2 projection=[t2_id] physical_plan 01)ProjectionExec: expr=[t1_id@1 as t1_id, t2_id@0 as t2_id, t1_name@2 as t1_name] 02)--CoalesceBatchesExec: target_batch_size=2 03)----HashJoinExec: mode=Partitioned, join_type=Inner, on=[(t2_id@0, join_t1.t1_id + UInt32(11)@2)], 
projection=[t2_id@0, t1_id@1, t1_name@2] 04)------CoalesceBatchesExec: target_batch_size=2 05)--------RepartitionExec: partitioning=Hash([t2_id@0], 2), input_partitions=2 -06)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -07)------------MemoryExec: partitions=1, partition_sizes=[1] -08)------CoalesceBatchesExec: target_batch_size=2 -09)--------RepartitionExec: partitioning=Hash([join_t1.t1_id + UInt32(11)@2], 2), input_partitions=2 -10)----------ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_id@0 + 11 as join_t1.t1_id + UInt32(11)] -11)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -12)--------------MemoryExec: partitions=1, partition_sizes=[1] +06)----------CoalesceBatchesExec: target_batch_size=2 +07)------------FilterExec: t2_id@0 IS NOT NULL +08)--------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +09)----------------MemoryExec: partitions=1, partition_sizes=[1] +10)------CoalesceBatchesExec: target_batch_size=2 +11)--------RepartitionExec: partitioning=Hash([join_t1.t1_id + UInt32(11)@2], 2), input_partitions=2 +12)----------ProjectionExec: expr=[t1_id@0 as t1_id, t1_name@1 as t1_name, t1_id@0 + 11 as join_t1.t1_id + UInt32(11)] +13)------------CoalesceBatchesExec: target_batch_size=2 +14)--------------FilterExec: t1_id@0 + 11 IS NOT NULL +15)----------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +16)------------------MemoryExec: partitions=1, partition_sizes=[1] # Right side expr key inner join @@ -1674,18 +1743,24 @@ ON join_t1.t1_id = join_t2.t2_id - cast(11 as INT UNSIGNED) logical_plan 01)Projection: join_t1.t1_id, join_t2.t2_id, join_t1.t1_name 02)--Inner Join: join_t1.t1_id = join_t2.t2_id - UInt32(11) -03)----TableScan: join_t1 projection=[t1_id, t1_name] -04)----TableScan: join_t2 projection=[t2_id] +03)----Filter: join_t1.t1_id IS NOT NULL +04)------TableScan: join_t1 projection=[t1_id, t1_name] 
+05)----Filter: join_t2.t2_id - UInt32(11) IS NOT NULL +06)------TableScan: join_t2 projection=[t2_id] physical_plan 01)ProjectionExec: expr=[t1_id@1 as t1_id, t2_id@0 as t2_id, t1_name@2 as t1_name] 02)--CoalesceBatchesExec: target_batch_size=2 03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(join_t2.t2_id - UInt32(11)@1, t1_id@0)], projection=[t2_id@0, t1_id@2, t1_name@3] 04)------CoalescePartitionsExec 05)--------ProjectionExec: expr=[t2_id@0 as t2_id, t2_id@0 - 11 as join_t2.t2_id - UInt32(11)] -06)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -07)------------MemoryExec: partitions=1, partition_sizes=[1] -08)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -09)--------MemoryExec: partitions=1, partition_sizes=[1] +06)----------CoalesceBatchesExec: target_batch_size=2 +07)------------FilterExec: t2_id@0 - 11 IS NOT NULL +08)--------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +09)----------------MemoryExec: partitions=1, partition_sizes=[1] +10)------CoalesceBatchesExec: target_batch_size=2 +11)--------FilterExec: t1_id@0 IS NOT NULL +12)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +13)------------MemoryExec: partitions=1, partition_sizes=[1] statement ok set datafusion.optimizer.repartition_joins = true; @@ -1700,8 +1775,10 @@ ON join_t1.t1_id = join_t2.t2_id - cast(11 as INT UNSIGNED) logical_plan 01)Projection: join_t1.t1_id, join_t2.t2_id, join_t1.t1_name 02)--Inner Join: join_t1.t1_id = join_t2.t2_id - UInt32(11) -03)----TableScan: join_t1 projection=[t1_id, t1_name] -04)----TableScan: join_t2 projection=[t2_id] +03)----Filter: join_t1.t1_id IS NOT NULL +04)------TableScan: join_t1 projection=[t1_id, t1_name] +05)----Filter: join_t2.t2_id - UInt32(11) IS NOT NULL +06)------TableScan: join_t2 projection=[t2_id] physical_plan 01)ProjectionExec: expr=[t1_id@1 as t1_id, t2_id@0 as t2_id, t1_name@2 as t1_name] 
02)--CoalesceBatchesExec: target_batch_size=2 @@ -1709,12 +1786,16 @@ physical_plan 04)------CoalesceBatchesExec: target_batch_size=2 05)--------RepartitionExec: partitioning=Hash([join_t2.t2_id - UInt32(11)@1], 2), input_partitions=2 06)----------ProjectionExec: expr=[t2_id@0 as t2_id, t2_id@0 - 11 as join_t2.t2_id - UInt32(11)] -07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -08)--------------MemoryExec: partitions=1, partition_sizes=[1] -09)------CoalesceBatchesExec: target_batch_size=2 -10)--------RepartitionExec: partitioning=Hash([t1_id@0], 2), input_partitions=2 -11)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -12)------------MemoryExec: partitions=1, partition_sizes=[1] +07)------------CoalesceBatchesExec: target_batch_size=2 +08)--------------FilterExec: t2_id@0 - 11 IS NOT NULL +09)----------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +10)------------------MemoryExec: partitions=1, partition_sizes=[1] +11)------CoalesceBatchesExec: target_batch_size=2 +12)--------RepartitionExec: partitioning=Hash([t1_id@0], 2), input_partitions=2 +13)----------CoalesceBatchesExec: target_batch_size=2 +14)------------FilterExec: t1_id@0 IS NOT NULL +15)--------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +16)----------------MemoryExec: partitions=1, partition_sizes=[1] # Select wildcard with expr key inner join @@ -1730,15 +1811,23 @@ ON join_t1.t1_id = join_t2.t2_id - cast(11 as INT UNSIGNED) ---- logical_plan 01)Inner Join: join_t1.t1_id = join_t2.t2_id - UInt32(11) -02)--TableScan: join_t1 projection=[t1_id, t1_name, t1_int] -03)--TableScan: join_t2 projection=[t2_id, t2_name, t2_int] +02)--Filter: join_t1.t1_id IS NOT NULL +03)----TableScan: join_t1 projection=[t1_id, t1_name, t1_int] +04)--Filter: join_t2.t2_id - UInt32(11) IS NOT NULL +05)----TableScan: join_t2 projection=[t2_id, t2_name, t2_int] physical_plan 01)CoalesceBatchesExec: 
target_batch_size=2 02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t1_id@0, join_t2.t2_id - UInt32(11)@3)], projection=[t1_id@0, t1_name@1, t1_int@2, t2_id@3, t2_name@4, t2_int@5] -03)----MemoryExec: partitions=1, partition_sizes=[1] -04)----ProjectionExec: expr=[t2_id@0 as t2_id, t2_name@1 as t2_name, t2_int@2 as t2_int, t2_id@0 - 11 as join_t2.t2_id - UInt32(11)] -05)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -06)--------MemoryExec: partitions=1, partition_sizes=[1] +03)----CoalescePartitionsExec +04)------CoalesceBatchesExec: target_batch_size=2 +05)--------FilterExec: t1_id@0 IS NOT NULL +06)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +07)------------MemoryExec: partitions=1, partition_sizes=[1] +08)----ProjectionExec: expr=[t2_id@0 as t2_id, t2_name@1 as t2_name, t2_int@2 as t2_int, t2_id@0 - 11 as join_t2.t2_id - UInt32(11)] +09)------CoalesceBatchesExec: target_batch_size=2 +10)--------FilterExec: t2_id@0 - 11 IS NOT NULL +11)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +12)------------MemoryExec: partitions=1, partition_sizes=[1] statement ok set datafusion.optimizer.repartition_joins = true; @@ -1752,20 +1841,26 @@ ON join_t1.t1_id = join_t2.t2_id - cast(11 as INT UNSIGNED) ---- logical_plan 01)Inner Join: join_t1.t1_id = join_t2.t2_id - UInt32(11) -02)--TableScan: join_t1 projection=[t1_id, t1_name, t1_int] -03)--TableScan: join_t2 projection=[t2_id, t2_name, t2_int] +02)--Filter: join_t1.t1_id IS NOT NULL +03)----TableScan: join_t1 projection=[t1_id, t1_name, t1_int] +04)--Filter: join_t2.t2_id - UInt32(11) IS NOT NULL +05)----TableScan: join_t2 projection=[t2_id, t2_name, t2_int] physical_plan 01)CoalesceBatchesExec: target_batch_size=2 02)--HashJoinExec: mode=Partitioned, join_type=Inner, on=[(t1_id@0, join_t2.t2_id - UInt32(11)@3)], projection=[t1_id@0, t1_name@1, t1_int@2, t2_id@3, t2_name@4, t2_int@5] 03)----CoalesceBatchesExec: 
target_batch_size=2 04)------RepartitionExec: partitioning=Hash([t1_id@0], 2), input_partitions=2 -05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -06)----------MemoryExec: partitions=1, partition_sizes=[1] -07)----CoalesceBatchesExec: target_batch_size=2 -08)------RepartitionExec: partitioning=Hash([join_t2.t2_id - UInt32(11)@3], 2), input_partitions=2 -09)--------ProjectionExec: expr=[t2_id@0 as t2_id, t2_name@1 as t2_name, t2_int@2 as t2_int, t2_id@0 - 11 as join_t2.t2_id - UInt32(11)] -10)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -11)------------MemoryExec: partitions=1, partition_sizes=[1] +05)--------CoalesceBatchesExec: target_batch_size=2 +06)----------FilterExec: t1_id@0 IS NOT NULL +07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +08)--------------MemoryExec: partitions=1, partition_sizes=[1] +09)----CoalesceBatchesExec: target_batch_size=2 +10)------RepartitionExec: partitioning=Hash([join_t2.t2_id - UInt32(11)@3], 2), input_partitions=2 +11)--------ProjectionExec: expr=[t2_id@0 as t2_id, t2_name@1 as t2_name, t2_int@2 as t2_int, t2_id@0 - 11 as join_t2.t2_id - UInt32(11)] +12)----------CoalesceBatchesExec: target_batch_size=2 +13)------------FilterExec: t2_id@0 - 11 IS NOT NULL +14)--------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +15)----------------MemoryExec: partitions=1, partition_sizes=[1] ##### # Config teardown @@ -1785,8 +1880,10 @@ inner join join_t2 on join_t1.t1_id + 11 = join_t2.t2_id ---- logical_plan 01)Inner Join: CAST(join_t1.t1_id AS Int64) + Int64(11) = CAST(join_t2.t2_id AS Int64) -02)--TableScan: join_t1 projection=[t1_id, t1_name] -03)--TableScan: join_t2 projection=[t2_id] +02)--Filter: CAST(join_t1.t1_id AS Int64) + Int64(11) IS NOT NULL +03)----TableScan: join_t1 projection=[t1_id, t1_name] +04)--Filter: CAST(join_t2.t2_id AS Int64) IS NOT NULL +05)----TableScan: join_t2 projection=[t2_id] # 
Join only with filter @@ -1812,8 +1909,10 @@ on join_t1.t1_id * 5 = join_t2.t2_id and join_t1.t1_id * 4 < join_t2.t2_id ---- logical_plan 01)Inner Join: CAST(join_t1.t1_id AS Int64) * Int64(5) = CAST(join_t2.t2_id AS Int64) Filter: CAST(join_t1.t1_id AS Int64) * Int64(4) < CAST(join_t2.t2_id AS Int64) -02)--TableScan: join_t1 projection=[t1_id, t1_name] -03)--TableScan: join_t2 projection=[t2_id] +02)--Filter: CAST(join_t1.t1_id AS Int64) * Int64(5) IS NOT NULL +03)----TableScan: join_t1 projection=[t1_id, t1_name] +04)--Filter: CAST(join_t2.t2_id AS Int64) IS NOT NULL +05)----TableScan: join_t2 projection=[t2_id] # Test cross join to groupby with different key ordering @@ -1867,10 +1966,12 @@ where join_t1.t1_id + 12 in (select join_t2.t2_id + 1 from join_t2) ---- logical_plan 01)LeftSemi Join: CAST(join_t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.join_t2.t2_id + Int64(1) -02)--TableScan: join_t1 projection=[t1_id, t1_name, t1_int] -03)--SubqueryAlias: __correlated_sq_1 -04)----Projection: CAST(join_t2.t2_id AS Int64) + Int64(1) -05)------TableScan: join_t2 projection=[t2_id] +02)--Filter: CAST(join_t1.t1_id AS Int64) + Int64(12) IS NOT NULL +03)----TableScan: join_t1 projection=[t1_id, t1_name, t1_int] +04)--SubqueryAlias: __correlated_sq_1 +05)----Projection: CAST(join_t2.t2_id AS Int64) + Int64(1) +06)------Filter: CAST(join_t2.t2_id AS Int64) + Int64(1) IS NOT NULL +07)--------TableScan: join_t2 projection=[t2_id] query ITI rowsort select join_t1.t1_id, join_t1.t1_name, join_t1.t1_int @@ -1896,11 +1997,12 @@ where join_t1.t1_id + 12 in ---- logical_plan 01)LeftSemi Join: CAST(join_t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.join_t2.t2_id + Int64(1) Filter: join_t1.t1_int <= __correlated_sq_1.t2_int -02)--TableScan: join_t1 projection=[t1_id, t1_name, t1_int] -03)--SubqueryAlias: __correlated_sq_1 -04)----Projection: CAST(join_t2.t2_id AS Int64) + Int64(1), join_t2.t2_int -05)------Filter: join_t2.t2_int > UInt32(0) -06)--------TableScan: join_t2 
projection=[t2_id, t2_int] +02)--Filter: CAST(join_t1.t1_id AS Int64) + Int64(12) IS NOT NULL +03)----TableScan: join_t1 projection=[t1_id, t1_name, t1_int] +04)--SubqueryAlias: __correlated_sq_1 +05)----Projection: CAST(join_t2.t2_id AS Int64) + Int64(1), join_t2.t2_int +06)------Filter: CAST(join_t2.t2_id AS Int64) + Int64(1) IS NOT NULL AND join_t2.t2_int > UInt32(0) +07)--------TableScan: join_t2 projection=[t2_id, t2_int] query ITI rowsort select join_t1.t1_id, join_t1.t1_name, join_t1.t1_int @@ -1932,11 +2034,12 @@ where join_t1.t1_id + 12 in ---- logical_plan 01)LeftSemi Join: CAST(join_t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.join_t2.t2_id + Int64(1) Filter: join_t1.t1_int <= __correlated_sq_1.t2_int AND join_t1.t1_name != __correlated_sq_1.t2_name -02)--TableScan: join_t1 projection=[t1_id, t1_name, t1_int] -03)--SubqueryAlias: __correlated_sq_1 -04)----Projection: CAST(join_t2.t2_id AS Int64) + Int64(1), join_t2.t2_int, join_t2.t2_name -05)------Filter: join_t2.t2_int > UInt32(0) -06)--------TableScan: join_t2 projection=[t2_id, t2_name, t2_int] +02)--Filter: CAST(join_t1.t1_id AS Int64) + Int64(12) IS NOT NULL +03)----TableScan: join_t1 projection=[t1_id, t1_name, t1_int] +04)--SubqueryAlias: __correlated_sq_1 +05)----Projection: CAST(join_t2.t2_id AS Int64) + Int64(1), join_t2.t2_int, join_t2.t2_name +06)------Filter: CAST(join_t2.t2_id AS Int64) + Int64(1) IS NOT NULL AND join_t2.t2_int > UInt32(0) +07)--------TableScan: join_t2 projection=[t2_id, t2_name, t2_int] query ITI rowsort select join_t1.t1_id, join_t1.t1_name, join_t1.t1_int @@ -1964,11 +2067,12 @@ where join_t1.t1_id + 12 in ---- logical_plan 01)LeftSemi Join: CAST(join_t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.join_t2.t2_id + Int64(1) -02)--Filter: join_t1.t1_int > UInt32(0) +02)--Filter: CAST(join_t1.t1_id AS Int64) + Int64(12) IS NOT NULL AND join_t1.t1_int > UInt32(0) 03)----TableScan: join_t1 projection=[t1_id, t1_name, t1_int] 04)--SubqueryAlias: __correlated_sq_1 
05)----Projection: CAST(join_t2.t2_id AS Int64) + Int64(1) -06)------TableScan: join_t2 projection=[t2_id] +06)------Filter: CAST(join_t2.t2_id AS Int64) + Int64(1) IS NOT NULL +07)--------TableScan: join_t2 projection=[t2_id] # Not in subquery to join with correlated outer filter @@ -1984,7 +2088,8 @@ logical_plan 02)--TableScan: join_t1 projection=[t1_id, t1_name, t1_int] 03)--SubqueryAlias: __correlated_sq_1 04)----Projection: CAST(join_t2.t2_id AS Int64) + Int64(1) -05)------TableScan: join_t2 projection=[t2_id] +05)------Filter: CAST(join_t2.t2_id AS Int64) + Int64(1) IS NOT NULL +06)--------TableScan: join_t2 projection=[t2_id] # In subquery to join with outer filter @@ -2003,11 +2108,12 @@ where join_t1.t1_id + 12 in ---- logical_plan 01)LeftSemi Join: CAST(join_t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.join_t2.t2_id + Int64(1) Filter: join_t1.t1_int <= __correlated_sq_1.t2_int AND join_t1.t1_name != __correlated_sq_1.t2_name -02)--Filter: join_t1.t1_id > UInt32(0) +02)--Filter: CAST(join_t1.t1_id AS Int64) + Int64(12) IS NOT NULL AND join_t1.t1_id > UInt32(0) 03)----TableScan: join_t1 projection=[t1_id, t1_name, t1_int] 04)--SubqueryAlias: __correlated_sq_1 05)----Projection: CAST(join_t2.t2_id AS Int64) + Int64(1), join_t2.t2_int, join_t2.t2_name -06)------TableScan: join_t2 projection=[t2_id, t2_name, t2_int] +06)------Filter: CAST(join_t2.t2_id AS Int64) + Int64(1) IS NOT NULL +07)--------TableScan: join_t2 projection=[t2_id, t2_name, t2_int] query ITI rowsort select join_t1.t1_id, join_t1.t1_name, join_t1.t1_int @@ -2037,14 +2143,16 @@ where join_t1.t1_id + 12 in (select join_t2.t2_id + 1 from join_t2) logical_plan 01)LeftSemi Join: CAST(join_t1.t1_int AS Int64) = __correlated_sq_2.join_t2.t2_int + Int64(1) 02)--LeftSemi Join: CAST(join_t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.join_t2.t2_id + Int64(1) -03)----Filter: join_t1.t1_id > UInt32(0) +03)----Filter: CAST(join_t1.t1_int AS Int64) IS NOT NULL AND CAST(join_t1.t1_id AS Int64) + 
Int64(12) IS NOT NULL AND join_t1.t1_id > UInt32(0) 04)------TableScan: join_t1 projection=[t1_id, t1_name, t1_int] 05)----SubqueryAlias: __correlated_sq_1 06)------Projection: CAST(join_t2.t2_id AS Int64) + Int64(1) -07)--------TableScan: join_t2 projection=[t2_id] -08)--SubqueryAlias: __correlated_sq_2 -09)----Projection: CAST(join_t2.t2_int AS Int64) + Int64(1) -10)------TableScan: join_t2 projection=[t2_int] +07)--------Filter: CAST(join_t2.t2_id AS Int64) + Int64(1) IS NOT NULL +08)----------TableScan: join_t2 projection=[t2_id] +09)--SubqueryAlias: __correlated_sq_2 +10)----Projection: CAST(join_t2.t2_int AS Int64) + Int64(1) +11)------Filter: CAST(join_t2.t2_int AS Int64) + Int64(1) IS NOT NULL +12)--------TableScan: join_t2 projection=[t2_int] query ITI select join_t1.t1_id, join_t1.t1_name, join_t1.t1_int @@ -2598,20 +2706,26 @@ EXPLAIN SELECT * FROM test_timestamps_tz_table as t1 JOIN test_timestamps_tz_tab logical_plan 01)Inner Join: t1.millis = t2.millis 02)--SubqueryAlias: t1 -03)----TableScan: test_timestamps_tz_table projection=[nanos, micros, millis, secs, names] -04)--SubqueryAlias: t2 -05)----TableScan: test_timestamps_tz_table projection=[nanos, micros, millis, secs, names] +03)----Filter: test_timestamps_tz_table.millis IS NOT NULL +04)------TableScan: test_timestamps_tz_table projection=[nanos, micros, millis, secs, names] +05)--SubqueryAlias: t2 +06)----Filter: test_timestamps_tz_table.millis IS NOT NULL +07)------TableScan: test_timestamps_tz_table projection=[nanos, micros, millis, secs, names] physical_plan 01)CoalesceBatchesExec: target_batch_size=2 02)--HashJoinExec: mode=Partitioned, join_type=Inner, on=[(millis@2, millis@2)] 03)----CoalesceBatchesExec: target_batch_size=2 04)------RepartitionExec: partitioning=Hash([millis@2], 2), input_partitions=2 -05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -06)----------MemoryExec: partitions=1, partition_sizes=[1] -07)----CoalesceBatchesExec: target_batch_size=2 
-08)------RepartitionExec: partitioning=Hash([millis@2], 2), input_partitions=2 -09)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -10)----------MemoryExec: partitions=1, partition_sizes=[1] +05)--------CoalesceBatchesExec: target_batch_size=2 +06)----------FilterExec: millis@2 IS NOT NULL +07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +08)--------------MemoryExec: partitions=1, partition_sizes=[1] +09)----CoalesceBatchesExec: target_batch_size=2 +10)------RepartitionExec: partitioning=Hash([millis@2], 2), input_partitions=2 +11)--------CoalesceBatchesExec: target_batch_size=2 +12)----------FilterExec: millis@2 IS NOT NULL +13)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +14)--------------MemoryExec: partitions=1, partition_sizes=[1] # left_join_using_2 query II @@ -2672,9 +2786,11 @@ explain select * from hashjoin_datatype_table_t1 t1 join hashjoin_datatype_table logical_plan 01)Inner Join: t1.c1 = t2.c1 02)--SubqueryAlias: t1 -03)----TableScan: hashjoin_datatype_table_t1 projection=[c1, c2, c3, c4] -04)--SubqueryAlias: t2 -05)----TableScan: hashjoin_datatype_table_t2 projection=[c1, c2, c3, c4] +03)----Filter: hashjoin_datatype_table_t1.c1 IS NOT NULL +04)------TableScan: hashjoin_datatype_table_t1 projection=[c1, c2, c3, c4] +05)--SubqueryAlias: t2 +06)----Filter: hashjoin_datatype_table_t2.c1 IS NOT NULL +07)------TableScan: hashjoin_datatype_table_t2 projection=[c1, c2, c3, c4] # hash_join_with_date32 query DDR?DDR? rowsort @@ -2693,7 +2809,8 @@ logical_plan 02)--SubqueryAlias: t1 03)----TableScan: hashjoin_datatype_table_t1 projection=[c1, c2, c3, c4] 04)--SubqueryAlias: t2 -05)----TableScan: hashjoin_datatype_table_t2 projection=[c1, c2, c3, c4] +05)----Filter: hashjoin_datatype_table_t2.c2 IS NOT NULL +06)------TableScan: hashjoin_datatype_table_t2 projection=[c1, c2, c3, c4] # hash_join_with_date64 query DDR?DDR? 
rowsort @@ -2712,9 +2829,10 @@ explain select * from hashjoin_datatype_table_t1 t1 right join hashjoin_datatype logical_plan 01)Right Join: t1.c3 = t2.c3 02)--SubqueryAlias: t1 -03)----TableScan: hashjoin_datatype_table_t1 projection=[c1, c2, c3, c4] -04)--SubqueryAlias: t2 -05)----TableScan: hashjoin_datatype_table_t1 projection=[c1, c2, c3, c4] +03)----Filter: hashjoin_datatype_table_t1.c3 IS NOT NULL +04)------TableScan: hashjoin_datatype_table_t1 projection=[c1, c2, c3, c4] +05)--SubqueryAlias: t2 +06)----TableScan: hashjoin_datatype_table_t1 projection=[c1, c2, c3, c4] # hash_join_with_decimal query DDR?DDR? rowsort @@ -2732,9 +2850,11 @@ explain select * from hashjoin_datatype_table_t1 t1 join hashjoin_datatype_table logical_plan 01)Inner Join: t1.c4 = t2.c4 02)--SubqueryAlias: t1 -03)----TableScan: hashjoin_datatype_table_t1 projection=[c1, c2, c3, c4] -04)--SubqueryAlias: t2 -05)----TableScan: hashjoin_datatype_table_t1 projection=[c1, c2, c3, c4] +03)----Filter: hashjoin_datatype_table_t1.c4 IS NOT NULL +04)------TableScan: hashjoin_datatype_table_t1 projection=[c1, c2, c3, c4] +05)--SubqueryAlias: t2 +06)----Filter: hashjoin_datatype_table_t1.c4 IS NOT NULL +07)------TableScan: hashjoin_datatype_table_t1 projection=[c1, c2, c3, c4] # hash_join_with_dictionary query DDR?DDR? 
rowsort @@ -2771,21 +2891,27 @@ explain select * from hashjoin_datatype_table_t1 t1 join hashjoin_datatype_table logical_plan 01)Inner Join: t1.c1 = t2.c1 02)--SubqueryAlias: t1 -03)----TableScan: hashjoin_datatype_table_t1 projection=[c1, c2, c3, c4] -04)--SubqueryAlias: t2 -05)----TableScan: hashjoin_datatype_table_t2 projection=[c1, c2, c3, c4] +03)----Filter: hashjoin_datatype_table_t1.c1 IS NOT NULL +04)------TableScan: hashjoin_datatype_table_t1 projection=[c1, c2, c3, c4] +05)--SubqueryAlias: t2 +06)----Filter: hashjoin_datatype_table_t2.c1 IS NOT NULL +07)------TableScan: hashjoin_datatype_table_t2 projection=[c1, c2, c3, c4] physical_plan 01)SortMergeJoin: join_type=Inner, on=[(c1@0, c1@0)] 02)--SortExec: expr=[c1@0 ASC], preserve_partitioning=[true] 03)----CoalesceBatchesExec: target_batch_size=2 04)------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2 -05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -06)----------MemoryExec: partitions=1, partition_sizes=[1] -07)--SortExec: expr=[c1@0 ASC], preserve_partitioning=[true] -08)----CoalesceBatchesExec: target_batch_size=2 -09)------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2 -10)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -11)----------MemoryExec: partitions=1, partition_sizes=[1] +05)--------CoalesceBatchesExec: target_batch_size=2 +06)----------FilterExec: c1@0 IS NOT NULL +07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +08)--------------MemoryExec: partitions=1, partition_sizes=[1] +09)--SortExec: expr=[c1@0 ASC], preserve_partitioning=[true] +10)----CoalesceBatchesExec: target_batch_size=2 +11)------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2 +12)--------CoalesceBatchesExec: target_batch_size=2 +13)----------FilterExec: c1@0 IS NOT NULL +14)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 
+15)--------------MemoryExec: partitions=1, partition_sizes=[1] # sort_merge_join_on_date32 inner sort merge join on data type (Date32) query DDR?DDR? rowsort @@ -2801,9 +2927,10 @@ explain select * from hashjoin_datatype_table_t1 t1 right join hashjoin_datatype logical_plan 01)Right Join: CAST(t1.c3 AS Decimal128(10, 2)) = t2.c3 02)--SubqueryAlias: t1 -03)----TableScan: hashjoin_datatype_table_t1 projection=[c1, c2, c3, c4] -04)--SubqueryAlias: t2 -05)----TableScan: hashjoin_datatype_table_t2 projection=[c1, c2, c3, c4] +03)----Filter: CAST(hashjoin_datatype_table_t1.c3 AS Decimal128(10, 2)) IS NOT NULL +04)------TableScan: hashjoin_datatype_table_t1 projection=[c1, c2, c3, c4] +05)--SubqueryAlias: t2 +06)----TableScan: hashjoin_datatype_table_t2 projection=[c1, c2, c3, c4] physical_plan 01)ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3, c4@3 as c4, c1@5 as c1, c2@6 as c2, c3@7 as c3, c4@8 as c4] 02)--SortMergeJoin: join_type=Right, on=[(CAST(t1.c3 AS Decimal128(10, 2))@4, c3@2)] @@ -2811,13 +2938,15 @@ physical_plan 04)------CoalesceBatchesExec: target_batch_size=2 05)--------RepartitionExec: partitioning=Hash([CAST(t1.c3 AS Decimal128(10, 2))@4], 2), input_partitions=2 06)----------ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c3@2 as c3, c4@3 as c4, CAST(c3@2 AS Decimal128(10, 2)) as CAST(t1.c3 AS Decimal128(10, 2))] -07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -08)--------------MemoryExec: partitions=1, partition_sizes=[1] -09)----SortExec: expr=[c3@2 ASC], preserve_partitioning=[true] -10)------CoalesceBatchesExec: target_batch_size=2 -11)--------RepartitionExec: partitioning=Hash([c3@2], 2), input_partitions=2 -12)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -13)------------MemoryExec: partitions=1, partition_sizes=[1] +07)------------CoalesceBatchesExec: target_batch_size=2 +08)--------------FilterExec: CAST(c3@2 AS Decimal128(10, 2)) IS NOT NULL 
+09)----------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +10)------------------MemoryExec: partitions=1, partition_sizes=[1] +11)----SortExec: expr=[c3@2 ASC], preserve_partitioning=[true] +12)------CoalesceBatchesExec: target_batch_size=2 +13)--------RepartitionExec: partitioning=Hash([c3@2], 2), input_partitions=2 +14)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +15)------------MemoryExec: partitions=1, partition_sizes=[1] # sort_merge_join_on_decimal right join on data type (Decimal) query DDR?DDR? rowsort @@ -2874,12 +3003,16 @@ physical_plan 04)------HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(t2_id@0, t1_id@0)] 05)--------CoalesceBatchesExec: target_batch_size=2 06)----------RepartitionExec: partitioning=Hash([t2_id@0], 2), input_partitions=2 -07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -08)--------------MemoryExec: partitions=1, partition_sizes=[1] -09)--------CoalesceBatchesExec: target_batch_size=2 -10)----------RepartitionExec: partitioning=Hash([t1_id@0], 2), input_partitions=2 -11)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -12)--------------MemoryExec: partitions=1, partition_sizes=[1] +07)------------CoalesceBatchesExec: target_batch_size=2 +08)--------------FilterExec: t2_id@0 IS NOT NULL +09)----------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +10)------------------MemoryExec: partitions=1, partition_sizes=[1] +11)--------CoalesceBatchesExec: target_batch_size=2 +12)----------RepartitionExec: partitioning=Hash([t1_id@0], 2), input_partitions=2 +13)------------CoalesceBatchesExec: target_batch_size=2 +14)--------------FilterExec: t1_id@0 IS NOT NULL +15)----------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +16)------------------MemoryExec: partitions=1, partition_sizes=[1] query IT rowsort SELECT t1_id, t1_name FROM 
left_semi_anti_join_table_t1 t1 WHERE t1_id IN (SELECT t2_id FROM left_semi_anti_join_table_t2 t2) ORDER BY t1_id @@ -2915,12 +3048,16 @@ physical_plan 04)------HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(t2_id@0, t1_id@0)] 05)--------CoalesceBatchesExec: target_batch_size=2 06)----------RepartitionExec: partitioning=Hash([t2_id@0], 2), input_partitions=2 -07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -08)--------------MemoryExec: partitions=1, partition_sizes=[1] -09)--------CoalesceBatchesExec: target_batch_size=2 -10)----------RepartitionExec: partitioning=Hash([t1_id@0], 2), input_partitions=2 -11)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -12)--------------MemoryExec: partitions=1, partition_sizes=[1] +07)------------CoalesceBatchesExec: target_batch_size=2 +08)--------------FilterExec: t2_id@0 IS NOT NULL +09)----------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +10)------------------MemoryExec: partitions=1, partition_sizes=[1] +11)--------CoalesceBatchesExec: target_batch_size=2 +12)----------RepartitionExec: partitioning=Hash([t1_id@0], 2), input_partitions=2 +13)------------CoalesceBatchesExec: target_batch_size=2 +14)--------------FilterExec: t1_id@0 IS NOT NULL +15)----------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +16)------------------MemoryExec: partitions=1, partition_sizes=[1] query IT SELECT t1_id, t1_name FROM left_semi_anti_join_table_t1 t1 LEFT SEMI JOIN left_semi_anti_join_table_t2 t2 ON (t1_id = t2_id) ORDER BY t1_id @@ -2975,9 +3112,15 @@ physical_plan 02)--SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true] 03)----CoalesceBatchesExec: target_batch_size=2 04)------HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)] -05)--------MemoryExec: partitions=1, partition_sizes=[1] -06)--------RepartitionExec: partitioning=RoundRobinBatch(2), 
input_partitions=1 -07)----------MemoryExec: partitions=1, partition_sizes=[1] +05)--------CoalescePartitionsExec +06)----------CoalesceBatchesExec: target_batch_size=2 +07)------------FilterExec: t2_id@0 IS NOT NULL +08)--------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +09)----------------MemoryExec: partitions=1, partition_sizes=[1] +10)--------CoalesceBatchesExec: target_batch_size=2 +11)----------FilterExec: t1_id@0 IS NOT NULL +12)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +13)--------------MemoryExec: partitions=1, partition_sizes=[1] query IT rowsort SELECT t1_id, t1_name FROM left_semi_anti_join_table_t1 t1 WHERE t1_id IN (SELECT t2_id FROM left_semi_anti_join_table_t2 t2) ORDER BY t1_id @@ -3011,9 +3154,15 @@ physical_plan 02)--SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true] 03)----CoalesceBatchesExec: target_batch_size=2 04)------HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)] -05)--------MemoryExec: partitions=1, partition_sizes=[1] -06)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -07)----------MemoryExec: partitions=1, partition_sizes=[1] +05)--------CoalescePartitionsExec +06)----------CoalesceBatchesExec: target_batch_size=2 +07)------------FilterExec: t2_id@0 IS NOT NULL +08)--------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +09)----------------MemoryExec: partitions=1, partition_sizes=[1] +10)--------CoalesceBatchesExec: target_batch_size=2 +11)----------FilterExec: t1_id@0 IS NOT NULL +12)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +13)--------------MemoryExec: partitions=1, partition_sizes=[1] query IT SELECT t1_id, t1_name FROM left_semi_anti_join_table_t1 t1 LEFT SEMI JOIN left_semi_anti_join_table_t2 t2 ON (t1_id = t2_id) ORDER BY t1_id @@ -3071,12 +3220,16 @@ physical_plan 04)------HashJoinExec: mode=Partitioned, 
join_type=RightSemi, on=[(t2_id@0, t1_id@0)], filter=t2_name@1 != t1_name@0 05)--------CoalesceBatchesExec: target_batch_size=2 06)----------RepartitionExec: partitioning=Hash([t2_id@0], 2), input_partitions=2 -07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -08)--------------MemoryExec: partitions=1, partition_sizes=[1] -09)--------CoalesceBatchesExec: target_batch_size=2 -10)----------RepartitionExec: partitioning=Hash([t1_id@0], 2), input_partitions=2 -11)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -12)--------------MemoryExec: partitions=1, partition_sizes=[1] +07)------------CoalesceBatchesExec: target_batch_size=2 +08)--------------FilterExec: t2_id@0 IS NOT NULL +09)----------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +10)------------------MemoryExec: partitions=1, partition_sizes=[1] +11)--------CoalesceBatchesExec: target_batch_size=2 +12)----------RepartitionExec: partitioning=Hash([t1_id@0], 2), input_partitions=2 +13)------------CoalesceBatchesExec: target_batch_size=2 +14)--------------FilterExec: t1_id@0 IS NOT NULL +15)----------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +16)------------------MemoryExec: partitions=1, partition_sizes=[1] query ITI rowsort SELECT t1_id, t1_name, t1_int FROM right_semi_anti_join_table_t1 t1 WHERE EXISTS (SELECT * FROM right_semi_anti_join_table_t2 t2 where t2.t2_id = t1.t1_id and t2.t2_name <> t1.t1_name) ORDER BY t1_id @@ -3093,12 +3246,16 @@ physical_plan 04)------HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(t2_id@0, t1_id@0)], filter=t2_name@0 != t1_name@1 05)--------CoalesceBatchesExec: target_batch_size=2 06)----------RepartitionExec: partitioning=Hash([t2_id@0], 2), input_partitions=2 -07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -08)--------------MemoryExec: partitions=1, partition_sizes=[1] -09)--------CoalesceBatchesExec: 
target_batch_size=2 -10)----------RepartitionExec: partitioning=Hash([t1_id@0], 2), input_partitions=2 -11)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -12)--------------MemoryExec: partitions=1, partition_sizes=[1] +07)------------CoalesceBatchesExec: target_batch_size=2 +08)--------------FilterExec: t2_id@0 IS NOT NULL +09)----------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +10)------------------MemoryExec: partitions=1, partition_sizes=[1] +11)--------CoalesceBatchesExec: target_batch_size=2 +12)----------RepartitionExec: partitioning=Hash([t1_id@0], 2), input_partitions=2 +13)------------CoalesceBatchesExec: target_batch_size=2 +14)--------------FilterExec: t1_id@0 IS NOT NULL +15)----------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +16)------------------MemoryExec: partitions=1, partition_sizes=[1] query ITI rowsort SELECT t1_id, t1_name, t1_int FROM right_semi_anti_join_table_t2 t2 RIGHT SEMI JOIN right_semi_anti_join_table_t1 t1 on (t2.t2_id = t1.t1_id and t2.t2_name <> t1.t1_name) ORDER BY t1_id @@ -3151,9 +3308,15 @@ physical_plan 02)--SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true] 03)----CoalesceBatchesExec: target_batch_size=2 04)------HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)], filter=t2_name@1 != t1_name@0 -05)--------MemoryExec: partitions=1, partition_sizes=[1] -06)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -07)----------MemoryExec: partitions=1, partition_sizes=[1] +05)--------CoalescePartitionsExec +06)----------CoalesceBatchesExec: target_batch_size=2 +07)------------FilterExec: t2_id@0 IS NOT NULL +08)--------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +09)----------------MemoryExec: partitions=1, partition_sizes=[1] +10)--------CoalesceBatchesExec: target_batch_size=2 +11)----------FilterExec: t1_id@0 IS NOT NULL 
+12)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +13)--------------MemoryExec: partitions=1, partition_sizes=[1] query ITI rowsort SELECT t1_id, t1_name, t1_int FROM right_semi_anti_join_table_t1 t1 WHERE EXISTS (SELECT * FROM right_semi_anti_join_table_t2 t2 where t2.t2_id = t1.t1_id and t2.t2_name <> t1.t1_name) ORDER BY t1_id @@ -3168,9 +3331,15 @@ physical_plan 02)--SortExec: expr=[t1_id@0 ASC NULLS LAST], preserve_partitioning=[true] 03)----CoalesceBatchesExec: target_batch_size=2 04)------HashJoinExec: mode=CollectLeft, join_type=RightSemi, on=[(t2_id@0, t1_id@0)], filter=t2_name@0 != t1_name@1 -05)--------MemoryExec: partitions=1, partition_sizes=[1] -06)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -07)----------MemoryExec: partitions=1, partition_sizes=[1] +05)--------CoalescePartitionsExec +06)----------CoalesceBatchesExec: target_batch_size=2 +07)------------FilterExec: t2_id@0 IS NOT NULL +08)--------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +09)----------------MemoryExec: partitions=1, partition_sizes=[1] +10)--------CoalesceBatchesExec: target_batch_size=2 +11)----------FilterExec: t1_id@0 IS NOT NULL +12)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +13)--------------MemoryExec: partitions=1, partition_sizes=[1] query ITI rowsort SELECT t1_id, t1_name, t1_int FROM right_semi_anti_join_table_t2 t2 RIGHT SEMI JOIN right_semi_anti_join_table_t1 t1 on (t2.t2_id = t1.t1_id and t2.t2_name <> t1.t1_name) ORDER BY t1_id @@ -3240,23 +3409,29 @@ logical_plan 02)--Inner Join: l_table.a = r_table.a 03)----SubqueryAlias: l_table 04)------Projection: annotated_data.a0, annotated_data.a, annotated_data.b, annotated_data.c, annotated_data.d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS rn1 -05)--------WindowAggr: windowExpr=[[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] 
-06)----------TableScan: annotated_data projection=[a0, a, b, c, d] -07)----SubqueryAlias: r_table -08)------TableScan: annotated_data projection=[a0, a, b, c, d] +05)--------Filter: annotated_data.a IS NOT NULL +06)----------WindowAggr: windowExpr=[[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] +07)------------TableScan: annotated_data projection=[a0, a, b, c, d] +08)----SubqueryAlias: r_table +09)------Filter: annotated_data.a IS NOT NULL +10)--------TableScan: annotated_data projection=[a0, a, b, c, d], partial_filters=[annotated_data.a IS NOT NULL] physical_plan 01)SortPreservingMergeExec: [rn1@5 ASC NULLS LAST] 02)--SortMergeJoin: join_type=Inner, on=[(a@1, a@1)] 03)----CoalesceBatchesExec: target_batch_size=2 04)------RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC,b@2 ASC NULLS LAST,c@3 ASC NULLS LAST,rn1@5 ASC NULLS LAST -05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -06)----------ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1] -07)------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }], mode=[Sorted] -08)--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true -09)----CoalesceBatchesExec: target_batch_size=2 -10)------RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 
ASC,b@2 ASC NULLS LAST,c@3 ASC NULLS LAST -11)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -12)----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true +05)--------ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1] +06)----------CoalesceBatchesExec: target_batch_size=2 +07)------------FilterExec: a@1 IS NOT NULL +08)--------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +09)----------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }], mode=[Sorted] +10)------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true +11)----CoalesceBatchesExec: target_batch_size=2 +12)------RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC,b@2 ASC NULLS LAST,c@3 ASC NULLS LAST +13)--------CoalesceBatchesExec: target_batch_size=2 +14)----------FilterExec: a@1 IS NOT NULL +15)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +16)--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true # sort merge join should propagate 
ordering equivalence of the right side # for right join. Hence final requirement rn1 ASC is already satisfied at @@ -3273,24 +3448,27 @@ logical_plan 01)Sort: r_table.rn1 ASC NULLS LAST 02)--Right Join: l_table.a = r_table.a 03)----SubqueryAlias: l_table -04)------TableScan: annotated_data projection=[a0, a, b, c, d] -05)----SubqueryAlias: r_table -06)------Projection: annotated_data.a0, annotated_data.a, annotated_data.b, annotated_data.c, annotated_data.d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS rn1 -07)--------WindowAggr: windowExpr=[[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] -08)----------TableScan: annotated_data projection=[a0, a, b, c, d] +04)------Filter: annotated_data.a IS NOT NULL +05)--------TableScan: annotated_data projection=[a0, a, b, c, d], partial_filters=[annotated_data.a IS NOT NULL] +06)----SubqueryAlias: r_table +07)------Projection: annotated_data.a0, annotated_data.a, annotated_data.b, annotated_data.c, annotated_data.d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS rn1 +08)--------WindowAggr: windowExpr=[[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] +09)----------TableScan: annotated_data projection=[a0, a, b, c, d] physical_plan 01)SortPreservingMergeExec: [rn1@10 ASC NULLS LAST] 02)--SortMergeJoin: join_type=Right, on=[(a@1, a@1)] 03)----CoalesceBatchesExec: target_batch_size=2 04)------RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC,b@2 ASC NULLS LAST,c@3 ASC NULLS LAST -05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -06)----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true -07)----CoalesceBatchesExec: target_batch_size=2 -08)------RepartitionExec: partitioning=Hash([a@1], 2), 
input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC,b@2 ASC NULLS LAST,c@3 ASC NULLS LAST,rn1@5 ASC NULLS LAST -09)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -10)----------ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1] -11)------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }], mode=[Sorted] -12)--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true +05)--------CoalesceBatchesExec: target_batch_size=2 +06)----------FilterExec: a@1 IS NOT NULL +07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +08)--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true +09)----CoalesceBatchesExec: target_batch_size=2 +10)------RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC,b@2 ASC NULLS LAST,c@3 ASC NULLS LAST,rn1@5 ASC NULLS LAST +11)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +12)----------ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1] +13)------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND 
UNBOUNDED FOLLOWING: Ok(Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }], mode=[Sorted] +14)--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true statement ok set datafusion.optimizer.prefer_existing_sort = false; @@ -3313,12 +3491,14 @@ logical_plan 02)--Inner Join: l_table.a = r_table.a 03)----SubqueryAlias: l_table 04)------Projection: annotated_data.a0, annotated_data.a, annotated_data.b, annotated_data.c, annotated_data.d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS rn1 -05)--------WindowAggr: windowExpr=[[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] -06)----------TableScan: annotated_data projection=[a0, a, b, c, d] -07)----SubqueryAlias: r_table -08)------Projection: annotated_data.a0, annotated_data.a, annotated_data.b, annotated_data.c, annotated_data.d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS rn1 -09)--------WindowAggr: windowExpr=[[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] -10)----------TableScan: annotated_data projection=[a0, a, b, c, d] +05)--------Filter: annotated_data.a IS NOT NULL +06)----------WindowAggr: windowExpr=[[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] +07)------------TableScan: annotated_data projection=[a0, a, b, c, d] +08)----SubqueryAlias: r_table +09)------Projection: annotated_data.a0, annotated_data.a, annotated_data.b, annotated_data.c, annotated_data.d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS rn1 +10)--------Filter: annotated_data.a IS NOT 
NULL +11)----------WindowAggr: windowExpr=[[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] +12)------------TableScan: annotated_data projection=[a0, a, b, c, d] physical_plan 01)SortPreservingMergeExec: [a@1 ASC,b@2 ASC NULLS LAST,c@3 ASC NULLS LAST,rn1@11 ASC NULLS LAST] 02)--SortExec: expr=[a@1 ASC,b@2 ASC NULLS LAST,c@3 ASC NULLS LAST,rn1@11 ASC NULLS LAST], preserve_partitioning=[true] @@ -3326,17 +3506,21 @@ physical_plan 04)------SortExec: expr=[a@1 ASC], preserve_partitioning=[true] 05)--------CoalesceBatchesExec: target_batch_size=2 06)----------RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=2 -07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -08)--------------ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1] -09)----------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }], mode=[Sorted] -10)------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true -11)------SortExec: expr=[a@1 ASC], preserve_partitioning=[true] -12)--------CoalesceBatchesExec: target_batch_size=2 -13)----------RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=2 -14)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -15)--------------ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND 
UNBOUNDED FOLLOWING@5 as rn1] -16)----------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }], mode=[Sorted] -17)------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true +07)------------ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1] +08)--------------CoalesceBatchesExec: target_batch_size=2 +09)----------------FilterExec: a@1 IS NOT NULL +10)------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +11)--------------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }], mode=[Sorted] +12)----------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true +13)------SortExec: expr=[a@1 ASC], preserve_partitioning=[true] +14)--------CoalesceBatchesExec: target_batch_size=2 +15)----------RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=2 +16)------------ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 
as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1] +17)--------------CoalesceBatchesExec: target_batch_size=2 +18)----------------FilterExec: a@1 IS NOT NULL +19)------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +20)--------------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }], mode=[Sorted] +21)----------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true statement ok set datafusion.optimizer.prefer_hash_join = true; @@ -3360,18 +3544,24 @@ logical_plan 01)Sort: r_table.rn1 ASC NULLS LAST 02)--Inner Join: l_table.a = r_table.a 03)----SubqueryAlias: l_table -04)------TableScan: annotated_data projection=[a0, a, b, c, d] -05)----SubqueryAlias: r_table -06)------Projection: annotated_data.a0, annotated_data.a, annotated_data.b, annotated_data.c, annotated_data.d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS rn1 -07)--------WindowAggr: windowExpr=[[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] -08)----------TableScan: annotated_data projection=[a0, a, b, c, d] +04)------Filter: annotated_data.a IS NOT NULL +05)--------TableScan: annotated_data projection=[a0, a, b, c, d], partial_filters=[annotated_data.a IS NOT NULL] +06)----SubqueryAlias: r_table +07)------Projection: annotated_data.a0, annotated_data.a, annotated_data.b, annotated_data.c, annotated_data.d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND 
UNBOUNDED FOLLOWING AS rn1 +08)--------Filter: annotated_data.a IS NOT NULL +09)----------WindowAggr: windowExpr=[[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] +10)------------TableScan: annotated_data projection=[a0, a, b, c, d] physical_plan 01)CoalesceBatchesExec: target_batch_size=2 02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@1, a@1)] -03)----CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true -04)----ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1] -05)------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }], mode=[Sorted] -06)--------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true +03)----CoalesceBatchesExec: target_batch_size=2 +04)------FilterExec: a@1 IS NOT NULL +05)--------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true +06)----ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1] +07)------CoalesceBatchesExec: target_batch_size=2 +08)--------FilterExec: a@1 IS NOT NULL +09)----------BoundedWindowAggExec: 
wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }], mode=[Sorted] +10)------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true # hash join should propagate ordering equivalence of the right side for RIGHT ANTI join. # Hence final requirement rn1 ASC is already satisfied at the end of HashJoinExec. @@ -3387,18 +3577,21 @@ logical_plan 01)Sort: r_table.rn1 ASC NULLS LAST 02)--RightAnti Join: l_table.a = r_table.a 03)----SubqueryAlias: l_table -04)------TableScan: annotated_data projection=[a] -05)----SubqueryAlias: r_table -06)------Projection: annotated_data.a0, annotated_data.a, annotated_data.b, annotated_data.c, annotated_data.d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS rn1 -07)--------WindowAggr: windowExpr=[[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] -08)----------TableScan: annotated_data projection=[a0, a, b, c, d] +04)------Filter: annotated_data.a IS NOT NULL +05)--------TableScan: annotated_data projection=[a], partial_filters=[annotated_data.a IS NOT NULL] +06)----SubqueryAlias: r_table +07)------Projection: annotated_data.a0, annotated_data.a, annotated_data.b, annotated_data.c, annotated_data.d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS rn1 +08)--------WindowAggr: windowExpr=[[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] +09)----------TableScan: annotated_data projection=[a0, a, b, c, d] physical_plan 01)CoalesceBatchesExec: target_batch_size=2 
02)--HashJoinExec: mode=CollectLeft, join_type=RightAnti, on=[(a@0, a@1)] -03)----CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a], output_ordering=[a@0 ASC], has_header=true -04)----ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1] -05)------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }], mode=[Sorted] -06)--------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true +03)----CoalesceBatchesExec: target_batch_size=2 +04)------FilterExec: a@0 IS NOT NULL +05)--------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a], output_ordering=[a@0 ASC], has_header=true +06)----ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1] +07)------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }], mode=[Sorted] +08)--------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, 
projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], has_header=true query TT EXPLAIN SELECT l.a, LAST_VALUE(r.b ORDER BY r.a ASC NULLS FIRST) as last_col1 @@ -3414,16 +3607,22 @@ logical_plan 03)----Aggregate: groupBy=[[l.a, l.b, l.c]], aggr=[[last_value(r.b) ORDER BY [r.a ASC NULLS FIRST]]] 04)------Inner Join: l.a = r.a 05)--------SubqueryAlias: l -06)----------TableScan: annotated_data projection=[a, b, c] -07)--------SubqueryAlias: r -08)----------TableScan: annotated_data projection=[a, b] +06)----------Filter: annotated_data.a IS NOT NULL +07)------------TableScan: annotated_data projection=[a, b, c], partial_filters=[annotated_data.a IS NOT NULL] +08)--------SubqueryAlias: r +09)----------Filter: annotated_data.a IS NOT NULL +10)------------TableScan: annotated_data projection=[a, b], partial_filters=[annotated_data.a IS NOT NULL] physical_plan 01)ProjectionExec: expr=[a@0 as a, last_value(r.b) ORDER BY [r.a ASC NULLS FIRST]@3 as last_col1] 02)--AggregateExec: mode=Single, gby=[a@0 as a, b@1 as b, c@2 as c], aggr=[last_value(r.b) ORDER BY [r.a ASC NULLS FIRST]], ordering_mode=PartiallySorted([0]) 03)----CoalesceBatchesExec: target_batch_size=2 04)------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0)] -05)--------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c], output_ordering=[a@0 ASC, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], has_header=true -06)--------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC, b@1 ASC NULLS LAST], has_header=true +05)--------CoalesceBatchesExec: target_batch_size=2 +06)----------FilterExec: a@0 IS NOT NULL +07)------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c], output_ordering=[a@0 ASC, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], has_header=true 
+08)--------CoalesceBatchesExec: target_batch_size=2 +09)----------FilterExec: a@0 IS NOT NULL +10)------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC, b@1 ASC NULLS LAST], has_header=true # create a table where there more than one valid ordering # that describes table. @@ -3503,9 +3702,11 @@ logical_plan 03)----Aggregate: groupBy=[[l.a, l.b, l.c]], aggr=[[last_value(r.b) ORDER BY [r.a ASC NULLS FIRST]]] 04)------Inner Join: l.a = r.a 05)--------SubqueryAlias: l -06)----------TableScan: annotated_data projection=[a, b, c] -07)--------SubqueryAlias: r -08)----------TableScan: annotated_data projection=[a, b] +06)----------Filter: annotated_data.a IS NOT NULL +07)------------TableScan: annotated_data projection=[a, b, c], partial_filters=[annotated_data.a IS NOT NULL] +08)--------SubqueryAlias: r +09)----------Filter: annotated_data.a IS NOT NULL +10)------------TableScan: annotated_data projection=[a, b], partial_filters=[annotated_data.a IS NOT NULL] physical_plan 01)SortPreservingMergeExec: [a@0 ASC] 02)--SortExec: expr=[a@0 ASC], preserve_partitioning=[true] @@ -3518,12 +3719,16 @@ physical_plan 09)----------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a@0)] 10)------------------CoalesceBatchesExec: target_batch_size=2 11)--------------------RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 -12)----------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -13)------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c], output_ordering=[a@0 ASC, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], has_header=true -14)------------------CoalesceBatchesExec: target_batch_size=2 -15)--------------------RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 -16)----------------------RepartitionExec: partitioning=RoundRobinBatch(2), 
input_partitions=1 -17)------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC, b@1 ASC NULLS LAST], has_header=true +12)----------------------CoalesceBatchesExec: target_batch_size=2 +13)------------------------FilterExec: a@0 IS NOT NULL +14)--------------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +15)----------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c], output_ordering=[a@0 ASC, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], has_header=true +16)------------------CoalesceBatchesExec: target_batch_size=2 +17)--------------------RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 +18)----------------------CoalesceBatchesExec: target_batch_size=2 +19)------------------------FilterExec: a@0 IS NOT NULL +20)--------------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +21)----------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC, b@1 ASC NULLS LAST], has_header=true query TT EXPLAIN SELECT * @@ -3928,16 +4133,19 @@ EXPLAIN SELECT * FROM ( logical_plan 01)Right Join: lhs.b = rhs.b 02)--SubqueryAlias: lhs -03)----TableScan: left_table_no_nulls projection=[a, b] -04)--SubqueryAlias: rhs -05)----Sort: right_table_no_nulls.b ASC NULLS LAST, fetch=10 -06)------TableScan: right_table_no_nulls projection=[a, b] +03)----Filter: left_table_no_nulls.b IS NOT NULL +04)------TableScan: left_table_no_nulls projection=[a, b] +05)--SubqueryAlias: rhs +06)----Sort: right_table_no_nulls.b ASC NULLS LAST, fetch=10 +07)------TableScan: right_table_no_nulls projection=[a, b] physical_plan 01)CoalesceBatchesExec: target_batch_size=3 02)--HashJoinExec: mode=CollectLeft, join_type=Right, on=[(b@1, b@1)] -03)----MemoryExec: 
partitions=1, partition_sizes=[1] -04)----SortExec: TopK(fetch=10), expr=[b@1 ASC NULLS LAST], preserve_partitioning=[false] -05)------MemoryExec: partitions=1, partition_sizes=[1] +03)----CoalesceBatchesExec: target_batch_size=3 +04)------FilterExec: b@1 IS NOT NULL +05)--------MemoryExec: partitions=1, partition_sizes=[1] +06)----SortExec: TopK(fetch=10), expr=[b@1 ASC NULLS LAST], preserve_partitioning=[false] +07)------MemoryExec: partitions=1, partition_sizes=[1] @@ -3986,14 +4194,17 @@ EXPLAIN SELECT * FROM ( logical_plan 01)Right Join: lhs.b = rhs.b 02)--SubqueryAlias: lhs -03)----TableScan: left_table_no_nulls projection=[a, b] -04)--SubqueryAlias: rhs -05)----TableScan: right_table_no_nulls projection=[a, b] +03)----Filter: left_table_no_nulls.b IS NOT NULL +04)------TableScan: left_table_no_nulls projection=[a, b] +05)--SubqueryAlias: rhs +06)----TableScan: right_table_no_nulls projection=[a, b] physical_plan 01)CoalesceBatchesExec: target_batch_size=3 02)--HashJoinExec: mode=CollectLeft, join_type=Right, on=[(b@1, b@1)] -03)----MemoryExec: partitions=1, partition_sizes=[1] -04)----MemoryExec: partitions=1, partition_sizes=[1] +03)----CoalesceBatchesExec: target_batch_size=3 +04)------FilterExec: b@1 IS NOT NULL +05)--------MemoryExec: partitions=1, partition_sizes=[1] +06)----MemoryExec: partitions=1, partition_sizes=[1] # Null build indices: @@ -4044,16 +4255,19 @@ EXPLAIN SELECT * FROM ( logical_plan 01)Right Join: lhs.b = rhs.b 02)--SubqueryAlias: lhs -03)----TableScan: left_table_no_nulls projection=[a, b] -04)--SubqueryAlias: rhs -05)----Sort: right_table_no_nulls.b ASC NULLS LAST, fetch=10 -06)------TableScan: right_table_no_nulls projection=[a, b] +03)----Filter: left_table_no_nulls.b IS NOT NULL +04)------TableScan: left_table_no_nulls projection=[a, b] +05)--SubqueryAlias: rhs +06)----Sort: right_table_no_nulls.b ASC NULLS LAST, fetch=10 +07)------TableScan: right_table_no_nulls projection=[a, b] physical_plan 01)CoalesceBatchesExec: 
target_batch_size=3 02)--HashJoinExec: mode=CollectLeft, join_type=Right, on=[(b@1, b@1)] -03)----MemoryExec: partitions=1, partition_sizes=[1] -04)----SortExec: TopK(fetch=10), expr=[b@1 ASC NULLS LAST], preserve_partitioning=[false] -05)------MemoryExec: partitions=1, partition_sizes=[1] +03)----CoalesceBatchesExec: target_batch_size=3 +04)------FilterExec: b@1 IS NOT NULL +05)--------MemoryExec: partitions=1, partition_sizes=[1] +06)----SortExec: TopK(fetch=10), expr=[b@1 ASC NULLS LAST], preserve_partitioning=[false] +07)------MemoryExec: partitions=1, partition_sizes=[1] # Test CROSS JOIN LATERAL syntax (planning) diff --git a/datafusion/sqllogictest/test_files/subquery.slt b/datafusion/sqllogictest/test_files/subquery.slt index 493db62286b6..594365eaa409 100644 --- a/datafusion/sqllogictest/test_files/subquery.slt +++ b/datafusion/sqllogictest/test_files/subquery.slt @@ -357,17 +357,20 @@ logical_plan 01)Sort: customer.c_custkey ASC NULLS LAST 02)--Projection: customer.c_custkey 03)----Inner Join: customer.c_custkey = __scalar_sq_1.o_custkey Filter: CAST(customer.c_acctbal AS Decimal128(25, 2)) < __scalar_sq_1.sum(orders.o_totalprice) -04)------TableScan: customer projection=[c_custkey, c_acctbal] -05)------SubqueryAlias: __scalar_sq_1 -06)--------Projection: sum(orders.o_totalprice), orders.o_custkey -07)----------Aggregate: groupBy=[[orders.o_custkey]], aggr=[[sum(orders.o_totalprice)]] -08)------------Projection: orders.o_custkey, orders.o_totalprice -09)--------------Inner Join: orders.o_orderkey = __scalar_sq_2.l_orderkey Filter: CAST(orders.o_totalprice AS Decimal128(25, 2)) < __scalar_sq_2.price -10)----------------TableScan: orders projection=[o_orderkey, o_custkey, o_totalprice] -11)----------------SubqueryAlias: __scalar_sq_2 -12)------------------Projection: sum(lineitem.l_extendedprice) AS price, lineitem.l_orderkey -13)--------------------Aggregate: groupBy=[[lineitem.l_orderkey]], aggr=[[sum(lineitem.l_extendedprice)]] 
-14)----------------------TableScan: lineitem projection=[l_orderkey, l_extendedprice] +04)------Filter: customer.c_custkey IS NOT NULL +05)--------TableScan: customer projection=[c_custkey, c_acctbal], partial_filters=[customer.c_custkey IS NOT NULL] +06)------SubqueryAlias: __scalar_sq_1 +07)--------Projection: sum(orders.o_totalprice), orders.o_custkey +08)----------Aggregate: groupBy=[[orders.o_custkey]], aggr=[[sum(orders.o_totalprice)]] +09)------------Projection: orders.o_custkey, orders.o_totalprice +10)--------------Inner Join: orders.o_orderkey = __scalar_sq_2.l_orderkey Filter: CAST(orders.o_totalprice AS Decimal128(25, 2)) < __scalar_sq_2.price +11)----------------Filter: orders.o_custkey IS NOT NULL AND orders.o_orderkey IS NOT NULL +12)------------------TableScan: orders projection=[o_orderkey, o_custkey, o_totalprice], partial_filters=[orders.o_custkey IS NOT NULL, orders.o_orderkey IS NOT NULL] +13)----------------SubqueryAlias: __scalar_sq_2 +14)------------------Projection: sum(lineitem.l_extendedprice) AS price, lineitem.l_orderkey +15)--------------------Aggregate: groupBy=[[lineitem.l_orderkey]], aggr=[[sum(lineitem.l_extendedprice)]] +16)----------------------Filter: lineitem.l_orderkey IS NOT NULL +17)------------------------TableScan: lineitem projection=[l_orderkey, l_extendedprice], partial_filters=[lineitem.l_orderkey IS NOT NULL] # correlated_where_in query TT @@ -379,10 +382,12 @@ where o_orderstatus in ( logical_plan 01)Projection: orders.o_orderkey 02)--LeftSemi Join: orders.o_orderstatus = __correlated_sq_1.l_linestatus, orders.o_orderkey = __correlated_sq_1.l_orderkey -03)----TableScan: orders projection=[o_orderkey, o_orderstatus] -04)----SubqueryAlias: __correlated_sq_1 -05)------Projection: lineitem.l_linestatus, lineitem.l_orderkey -06)--------TableScan: lineitem projection=[l_orderkey, l_linestatus] +03)----Filter: orders.o_orderstatus IS NOT NULL AND orders.o_orderkey IS NOT NULL +04)------TableScan: orders 
projection=[o_orderkey, o_orderstatus], partial_filters=[orders.o_orderstatus IS NOT NULL, orders.o_orderkey IS NOT NULL] +05)----SubqueryAlias: __correlated_sq_1 +06)------Projection: lineitem.l_linestatus, lineitem.l_orderkey +07)--------Filter: lineitem.l_linestatus IS NOT NULL AND lineitem.l_orderkey IS NOT NULL +08)----------TableScan: lineitem projection=[l_orderkey, l_linestatus], partial_filters=[lineitem.l_linestatus IS NOT NULL, lineitem.l_orderkey IS NOT NULL] query I rowsort select o_orderkey from orders @@ -416,11 +421,12 @@ explain SELECT t1_id, t1_name, t1_int FROM t1 WHERE t1_id IN(SELECT t1_int FROM ---- logical_plan 01)LeftSemi Join: t1.t1_id = __correlated_sq_1.t1_int -02)--TableScan: t1 projection=[t1_id, t1_name, t1_int] -03)--SubqueryAlias: __correlated_sq_1 -04)----Projection: t1.t1_int -05)------Filter: t1.t1_int < t1.t1_id -06)--------TableScan: t1 projection=[t1_id, t1_int] +02)--Filter: t1.t1_id IS NOT NULL +03)----TableScan: t1 projection=[t1_id, t1_name, t1_int] +04)--SubqueryAlias: __correlated_sq_1 +05)----Projection: t1.t1_int +06)------Filter: t1.t1_int IS NOT NULL AND t1.t1_int < t1.t1_id +07)--------TableScan: t1 projection=[t1_id, t1_int] #in_subquery_nested_exist_subquery query TT @@ -428,13 +434,15 @@ explain SELECT t1_id, t1_name, t1_int FROM t1 WHERE t1_id IN(SELECT t2_id FROM t ---- logical_plan 01)LeftSemi Join: t1.t1_id = __correlated_sq_1.t2_id -02)--TableScan: t1 projection=[t1_id, t1_name, t1_int] -03)--SubqueryAlias: __correlated_sq_1 -04)----Projection: t2.t2_id -05)------LeftSemi Join: Filter: __correlated_sq_2.t1_int > t2.t2_int -06)--------TableScan: t2 projection=[t2_id, t2_int] -07)--------SubqueryAlias: __correlated_sq_2 -08)----------TableScan: t1 projection=[t1_int] +02)--Filter: t1.t1_id IS NOT NULL +03)----TableScan: t1 projection=[t1_id, t1_name, t1_int] +04)--SubqueryAlias: __correlated_sq_1 +05)----Projection: t2.t2_id +06)------LeftSemi Join: Filter: __correlated_sq_2.t1_int > t2.t2_int 
+07)--------Filter: t2.t2_id IS NOT NULL +08)----------TableScan: t2 projection=[t2_id, t2_int] +09)--------SubqueryAlias: __correlated_sq_2 +10)----------TableScan: t1 projection=[t1_int] #invalid_scalar_subquery statement error DataFusion error: check_analyzed_plan\ncaused by\nError during planning: Scalar subquery should only return one column, but found 2: t2.t2_id, t2.t2_name @@ -568,13 +576,16 @@ explain SELECT t0_id, t0_name FROM t0 WHERE EXISTS (SELECT 1 FROM t1 INNER JOIN ---- logical_plan 01)LeftSemi Join: t0.t0_name = __correlated_sq_1.t2_name -02)--TableScan: t0 projection=[t0_id, t0_name] -03)--SubqueryAlias: __correlated_sq_1 -04)----Projection: t2.t2_name -05)------Inner Join: t1.t1_id = t2.t2_id -06)--------TableScan: t1 projection=[t1_id] -07)--------SubqueryAlias: t2 -08)----------TableScan: t2 projection=[t2_id, t2_name] +02)--Filter: t0.t0_name IS NOT NULL +03)----TableScan: t0 projection=[t0_id, t0_name] +04)--SubqueryAlias: __correlated_sq_1 +05)----Projection: t2.t2_name +06)------Inner Join: t1.t1_id = t2.t2_id +07)--------Filter: t1.t1_id IS NOT NULL +08)----------TableScan: t1 projection=[t1_id] +09)--------SubqueryAlias: t2 +10)----------Filter: t2.t2_name IS NOT NULL AND t2.t2_id IS NOT NULL +11)------------TableScan: t2 projection=[t2_id, t2_name] #subquery_contains_join_contains_sub_query_alias_correlated_columns query TT @@ -582,14 +593,17 @@ explain SELECT t0_id, t0_name FROM t0 WHERE EXISTS (select 1 from (SELECT * FROM ---- logical_plan 01)LeftSemi Join: t0.t0_id = __correlated_sq_1.t1_id, t0.t0_name = __correlated_sq_1.t2_name -02)--TableScan: t0 projection=[t0_id, t0_name] -03)--SubqueryAlias: __correlated_sq_1 -04)----Projection: x.t1_id, y.t2_name -05)------Inner Join: x.t1_id = y.t2_id -06)--------SubqueryAlias: x -07)----------TableScan: t1 projection=[t1_id] -08)--------SubqueryAlias: y -09)----------TableScan: t2 projection=[t2_id, t2_name] +02)--Filter: t0.t0_id IS NOT NULL AND t0.t0_name IS NOT NULL +03)----TableScan: t0 
projection=[t0_id, t0_name] +04)--SubqueryAlias: __correlated_sq_1 +05)----Projection: x.t1_id, y.t2_name +06)------Inner Join: x.t1_id = y.t2_id +07)--------SubqueryAlias: x +08)----------Filter: t1.t1_id IS NOT NULL +09)------------TableScan: t1 projection=[t1_id] +10)--------SubqueryAlias: y +11)----------Filter: t2.t2_name IS NOT NULL AND t2.t2_id IS NOT NULL +12)------------TableScan: t2 projection=[t2_id, t2_name] #support_order_by_correlated_columns query TT @@ -622,9 +636,11 @@ explain SELECT t1_id, t1_name FROM t1 WHERE EXISTS (SELECT * FROM t2 WHERE t2_id ---- logical_plan 01)LeftSemi Join: t1.t1_id = __correlated_sq_1.t2_id -02)--TableScan: t1 projection=[t1_id, t1_name] -03)--SubqueryAlias: __correlated_sq_1 -04)----TableScan: t2 projection=[t2_id] +02)--Filter: t1.t1_id IS NOT NULL +03)----TableScan: t1 projection=[t1_id, t1_name] +04)--SubqueryAlias: __correlated_sq_1 +05)----Filter: t2.t2_id IS NOT NULL +06)------TableScan: t2 projection=[t2_id] query IT rowsort SELECT t1_id, t1_name FROM t1 WHERE EXISTS (SELECT * FROM t2 WHERE t2_id = t1_id limit 1) @@ -681,10 +697,12 @@ explain SELECT t1_id, t1_name FROM t1 WHERE t1_id in (SELECT t2_id FROM t2 limit ---- logical_plan 01)LeftSemi Join: t1.t1_id = __correlated_sq_1.t2_id -02)--TableScan: t1 projection=[t1_id, t1_name] -03)--SubqueryAlias: __correlated_sq_1 -04)----Limit: skip=0, fetch=10 -05)------TableScan: t2 projection=[t2_id], fetch=10 +02)--Filter: t1.t1_id IS NOT NULL +03)----TableScan: t1 projection=[t1_id, t1_name] +04)--SubqueryAlias: __correlated_sq_1 +05)----Filter: t2.t2_id IS NOT NULL +06)------Limit: skip=0, fetch=10 +07)--------TableScan: t2 projection=[t2_id], fetch=10 #uncorrelated_scalar_subquery_with_limit0 @@ -787,7 +805,9 @@ logical_plan 04)----SubqueryAlias: __scalar_sq_1 05)------Projection: count(*), t2.t2_int, Boolean(true) AS __always_true 06)--------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(Int64(1)) AS count(*)]] -07)----------TableScan: t2 projection=[t2_int] 
+07)----------Filter: t2.t2_int IS NOT NULL +08)------------TableScan: t2 projection=[t2_int] + query II rowsort SELECT t1_id, (SELECT count(*) FROM t2 WHERE t2.t2_int = t1.t1_int) from t1 @@ -809,7 +829,8 @@ logical_plan 04)----SubqueryAlias: __scalar_sq_1 05)------Projection: count(*), t2.t2_int, Boolean(true) AS __always_true 06)--------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(Int64(1)) AS count(*)]] -07)----------TableScan: t2 projection=[t2_int] +07)----------Filter: t2.t2_int IS NOT NULL +08)------------TableScan: t2 projection=[t2_int] query II rowsort SELECT t1_id, (SELECT count(*) FROM t2 WHERE t2.t2_int = t1.t1_int) as cnt from t1 @@ -830,7 +851,8 @@ logical_plan 04)----SubqueryAlias: __scalar_sq_1 05)------Projection: count(*) AS _cnt, t2.t2_int, Boolean(true) AS __always_true 06)--------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(Int64(1)) AS count(*)]] -07)----------TableScan: t2 projection=[t2_int] +07)----------Filter: t2.t2_int IS NOT NULL +08)------------TableScan: t2 projection=[t2_int] query II rowsort SELECT t1_id, (SELECT count(*) as _cnt FROM t2 WHERE t2.t2_int = t1.t1_int) as cnt from t1 @@ -851,7 +873,8 @@ logical_plan 04)----SubqueryAlias: __scalar_sq_1 05)------Projection: count(*) + Int64(2) AS _cnt, t2.t2_int, Boolean(true) AS __always_true 06)--------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(Int64(1)) AS count(*)]] -07)----------TableScan: t2 projection=[t2_int] +07)----------Filter: t2.t2_int IS NOT NULL +08)------------TableScan: t2 projection=[t2_int] query II rowsort SELECT t1_id, (SELECT count(*) + 2 as _cnt FROM t2 WHERE t2.t2_int = t1.t1_int) from t1 @@ -874,7 +897,8 @@ logical_plan 06)--------SubqueryAlias: __scalar_sq_1 07)----------Projection: count(*), t2.t2_id, Boolean(true) AS __always_true 08)------------Aggregate: groupBy=[[t2.t2_id]], aggr=[[count(Int64(1)) AS count(*)]] -09)--------------TableScan: t2 projection=[t2_id] +09)--------------Filter: t2.t2_id IS NOT NULL +10)----------------TableScan: t2 
projection=[t2_id] query I rowsort select t1.t1_int from t1 where (select count(*) from t2 where t1.t1_id = t2.t2_id) < t1.t1_int @@ -896,7 +920,8 @@ logical_plan 05)------Projection: count(*) + Int64(2) AS cnt_plus_2, t2.t2_int 06)--------Filter: count(*) > Int64(1) 07)----------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(Int64(1)) AS count(*)]] -08)------------TableScan: t2 projection=[t2_int] +08)------------Filter: t2.t2_int IS NOT NULL +09)--------------TableScan: t2 projection=[t2_int] query II rowsort SELECT t1_id, (SELECT count(*) + 2 as cnt_plus_2 FROM t2 WHERE t2.t2_int = t1.t1_int having count(*) >1) from t1 @@ -918,7 +943,8 @@ logical_plan 04)----SubqueryAlias: __scalar_sq_1 05)------Projection: count(*) + Int64(2) AS cnt_plus_2, t2.t2_int, count(*), Boolean(true) AS __always_true 06)--------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(Int64(1)) AS count(*)]] -07)----------TableScan: t2 projection=[t2_int] +07)----------Filter: t2.t2_int IS NOT NULL +08)------------TableScan: t2 projection=[t2_int] query II rowsort SELECT t1_id, (SELECT count(*) + 2 as cnt_plus_2 FROM t2 WHERE t2.t2_int = t1.t1_int having count(*) = 0) from t1 @@ -942,7 +968,8 @@ logical_plan 07)--------SubqueryAlias: __scalar_sq_1 08)----------Projection: count(*), t2.t2_int, Boolean(true) AS __always_true 09)------------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(Int64(1)) AS count(*)]] -10)--------------TableScan: t2 projection=[t2_int] +10)--------------Filter: t2.t2_int IS NOT NULL +11)----------------TableScan: t2 projection=[t2_int] query I rowsort select t1.t1_int from t1 group by t1.t1_int having (select count(*) from t2 where t1.t1_int = t2.t2_int) = 0 @@ -963,7 +990,8 @@ logical_plan 06)--------SubqueryAlias: __scalar_sq_1 07)----------Projection: count(*) AS cnt, t2.t2_int, Boolean(true) AS __always_true 08)------------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(Int64(1)) AS count(*)]] -09)--------------TableScan: t2 projection=[t2_int] +09)--------------Filter: 
t2.t2_int IS NOT NULL +10)----------------TableScan: t2 projection=[t2_int] query I rowsort @@ -993,7 +1021,8 @@ logical_plan 06)--------SubqueryAlias: __scalar_sq_1 07)----------Projection: count(*) + Int64(1) + Int64(1) AS cnt_plus_two, t2.t2_int, count(*), Boolean(true) AS __always_true 08)------------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(Int64(1)) AS count(*)]] -09)--------------TableScan: t2 projection=[t2_int] +09)--------------Filter: t2.t2_int IS NOT NULL +10)----------------TableScan: t2 projection=[t2_int] query I rowsort select t1.t1_int from t1 where ( @@ -1022,7 +1051,8 @@ logical_plan 06)--------SubqueryAlias: __scalar_sq_1 07)----------Projection: CASE WHEN count(*) = Int64(1) THEN Int64(NULL) ELSE count(*) END AS cnt, t2.t2_int, Boolean(true) AS __always_true 08)------------Aggregate: groupBy=[[t2.t2_int]], aggr=[[count(Int64(1)) AS count(*)]] -09)--------------TableScan: t2 projection=[t2_int] +09)--------------Filter: t2.t2_int IS NOT NULL +10)----------------TableScan: t2 projection=[t2_int] query I rowsort diff --git a/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part b/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part index 92507aaf947f..a4a210382737 100644 --- a/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part @@ -30,7 +30,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS supplier ( s_phone VARCHAR NOT NULL, s_acctbal DECIMAL(15, 2) NOT NULL, s_comment VARCHAR NOT NULL, - NOT NULL VARCHAR NOT NULL, + s_rev VARCHAR NOT NULL, ) STORED AS CSV LOCATION 'test_files/tpch/data/supplier.tbl' OPTIONS ('format.delimiter' '|', 'format.has_header' 'false'); statement ok From 4427a9f2b0070ebc57487e603ffea11e502984d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 7 Sep 2024 21:13:19 +0200 Subject: [PATCH 19/22] Wip --- .../tests/cases/roundtrip_logical_plan.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 
deletions(-) diff --git a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs index cc353ab36d97..ed8ba55e802d 100644 --- a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs @@ -491,8 +491,10 @@ async fn roundtrip_exists_filter() -> Result<()> { "SELECT b FROM data d1 WHERE EXISTS (SELECT * FROM data2 d2 WHERE d2.a = d1.a AND d2.e != d1.e)", "Projection: data.b\ \n LeftSemi Join: data.a = data2.a Filter: data2.e != CAST(data.e AS Int64)\ - \n TableScan: data projection=[a, b, e]\ - \n TableScan: data2 projection=[a, e]", + \n Filter: data.a IS NOT NULL\ + \n TableScan: data projection=[a, b, e], partial_filters=[data.a IS NOT NULL]\ + \n Filter: data2.a IS NOT NULL\ + \n TableScan: data2 projection=[a, e], partial_filters=[data2.a IS NOT NULL]", false // "d1" vs "data" field qualifier ).await } @@ -502,9 +504,11 @@ async fn inner_join() -> Result<()> { assert_expected_plan( "SELECT data.a FROM data JOIN data2 ON data.a = data2.a", "Projection: data.a\ - \n Inner Join: data.a = data2.a\ - \n TableScan: data projection=[a]\ - \n TableScan: data2 projection=[a]", + \n Inner Join: data.a = data2.a\ + \n Filter: data.a IS NOT NULL\ + \n TableScan: data projection=[a], partial_filters=[data.a IS NOT NULL]\ + \n Filter: data2.a IS NOT NULL\ + \n TableScan: data2 projection=[a], partial_filters=[data2.a IS NOT NULL]", true, ) .await From 3645f41cdc713d08ad9c8c41bfdd7142df6022bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 7 Sep 2024 21:15:08 +0200 Subject: [PATCH 20/22] Wipc --- .../sqllogictest/test_files/tpch/q15.slt.part | 71 ++++++++++--------- 1 file changed, 39 insertions(+), 32 deletions(-) diff --git a/datafusion/sqllogictest/test_files/tpch/q15.slt.part b/datafusion/sqllogictest/test_files/tpch/q15.slt.part index 2374fd8430a4..83217ac86b71 100644 --- 
a/datafusion/sqllogictest/test_files/tpch/q15.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/q15.slt.part @@ -58,18 +58,20 @@ logical_plan 06)----------TableScan: supplier projection=[s_suppkey, s_name, s_address, s_phone] 07)----------SubqueryAlias: revenue0 08)------------Projection: lineitem.l_suppkey AS supplier_no, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS total_revenue -09)--------------Aggregate: groupBy=[[lineitem.l_suppkey]], aggr=[[sum(lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount)) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] -10)----------------Projection: lineitem.l_suppkey, lineitem.l_extendedprice, lineitem.l_discount -11)------------------Filter: lineitem.l_shipdate >= Date32("1996-01-01") AND lineitem.l_shipdate < Date32("1996-04-01") -12)--------------------TableScan: lineitem projection=[l_suppkey, l_extendedprice, l_discount, l_shipdate], partial_filters=[lineitem.l_shipdate >= Date32("1996-01-01"), lineitem.l_shipdate < Date32("1996-04-01")] -13)------SubqueryAlias: __scalar_sq_1 -14)--------Aggregate: groupBy=[[]], aggr=[[max(revenue0.total_revenue)]] -15)----------SubqueryAlias: revenue0 -16)------------Projection: sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS total_revenue -17)--------------Aggregate: groupBy=[[lineitem.l_suppkey]], aggr=[[sum(lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount)) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] -18)----------------Projection: lineitem.l_suppkey, lineitem.l_extendedprice, lineitem.l_discount -19)------------------Filter: lineitem.l_shipdate >= Date32("1996-01-01") AND lineitem.l_shipdate < Date32("1996-04-01") -20)--------------------TableScan: lineitem projection=[l_suppkey, l_extendedprice, l_discount, l_shipdate], partial_filters=[lineitem.l_shipdate >= Date32("1996-01-01"), lineitem.l_shipdate < Date32("1996-04-01")] +09)--------------Filter: 
sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) IS NOT NULL +10)----------------Aggregate: groupBy=[[lineitem.l_suppkey]], aggr=[[sum(lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount)) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] +11)------------------Projection: lineitem.l_suppkey, lineitem.l_extendedprice, lineitem.l_discount +12)--------------------Filter: lineitem.l_shipdate >= Date32("1996-01-01") AND lineitem.l_shipdate < Date32("1996-04-01") +13)----------------------TableScan: lineitem projection=[l_suppkey, l_extendedprice, l_discount, l_shipdate], partial_filters=[lineitem.l_shipdate >= Date32("1996-01-01"), lineitem.l_shipdate < Date32("1996-04-01")] +14)------SubqueryAlias: __scalar_sq_1 +15)--------Filter: max(revenue0.total_revenue) IS NOT NULL +16)----------Aggregate: groupBy=[[]], aggr=[[max(revenue0.total_revenue)]] +17)------------SubqueryAlias: revenue0 +18)--------------Projection: sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS total_revenue +19)----------------Aggregate: groupBy=[[lineitem.l_suppkey]], aggr=[[sum(lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount)) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] +20)------------------Projection: lineitem.l_suppkey, lineitem.l_extendedprice, lineitem.l_discount +21)--------------------Filter: lineitem.l_shipdate >= Date32("1996-01-01") AND lineitem.l_shipdate < Date32("1996-04-01") +22)----------------------TableScan: lineitem projection=[l_suppkey, l_extendedprice, l_discount, l_shipdate], partial_filters=[lineitem.l_shipdate >= Date32("1996-01-01"), lineitem.l_shipdate < Date32("1996-04-01")] physical_plan 01)SortPreservingMergeExec: [s_suppkey@0 ASC NULLS LAST] 02)--SortExec: expr=[s_suppkey@0 ASC NULLS LAST], preserve_partitioning=[true] @@ -84,26 +86,31 @@ physical_plan 11)--------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 
12)----------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_name, s_address, s_phone], has_header=false 13)----------------ProjectionExec: expr=[l_suppkey@0 as supplier_no, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@1 as total_revenue] -14)------------------AggregateExec: mode=FinalPartitioned, gby=[l_suppkey@0 as l_suppkey], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] -15)--------------------CoalesceBatchesExec: target_batch_size=8192 -16)----------------------RepartitionExec: partitioning=Hash([l_suppkey@0], 4), input_partitions=4 -17)------------------------AggregateExec: mode=Partial, gby=[l_suppkey@0 as l_suppkey], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] -18)--------------------------CoalesceBatchesExec: target_batch_size=8192 -19)----------------------------FilterExec: l_shipdate@3 >= 1996-01-01 AND l_shipdate@3 < 1996-04-01, projection=[l_suppkey@0, l_extendedprice@1, l_discount@2] -20)------------------------------CsvExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_suppkey, l_extendedprice, l_discount, l_shipdate], has_header=false -21)--------CoalesceBatchesExec: target_batch_size=8192 -22)----------RepartitionExec: partitioning=Hash([max(revenue0.total_revenue)@0], 4), input_partitions=1 -23)------------AggregateExec: mode=Final, gby=[], aggr=[max(revenue0.total_revenue)] -24)--------------CoalescePartitionsExec -25)----------------AggregateExec: mode=Partial, gby=[], aggr=[max(revenue0.total_revenue)] 
-26)------------------ProjectionExec: expr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@1 as total_revenue] -27)--------------------AggregateExec: mode=FinalPartitioned, gby=[l_suppkey@0 as l_suppkey], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] -28)----------------------CoalesceBatchesExec: target_batch_size=8192 -29)------------------------RepartitionExec: partitioning=Hash([l_suppkey@0], 4), input_partitions=4 -30)--------------------------AggregateExec: mode=Partial, gby=[l_suppkey@0 as l_suppkey], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] -31)----------------------------CoalesceBatchesExec: target_batch_size=8192 -32)------------------------------FilterExec: l_shipdate@3 >= 1996-01-01 AND l_shipdate@3 < 1996-04-01, projection=[l_suppkey@0, l_extendedprice@1, l_discount@2] -33)--------------------------------CsvExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_suppkey, l_extendedprice, l_discount, l_shipdate], has_header=false +14)------------------CoalesceBatchesExec: target_batch_size=8192 +15)--------------------FilterExec: sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@1 IS NOT NULL +16)----------------------AggregateExec: mode=FinalPartitioned, gby=[l_suppkey@0 as l_suppkey], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] +17)------------------------CoalesceBatchesExec: target_batch_size=8192 +18)--------------------------RepartitionExec: partitioning=Hash([l_suppkey@0], 4), input_partitions=4 +19)----------------------------AggregateExec: mode=Partial, gby=[l_suppkey@0 as l_suppkey], 
aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] +20)------------------------------CoalesceBatchesExec: target_batch_size=8192 +21)--------------------------------FilterExec: l_shipdate@3 >= 1996-01-01 AND l_shipdate@3 < 1996-04-01, projection=[l_suppkey@0, l_extendedprice@1, l_discount@2] +22)----------------------------------CsvExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_suppkey, l_extendedprice, l_discount, l_shipdate], has_header=false +23)--------CoalesceBatchesExec: target_batch_size=8192 +24)----------RepartitionExec: partitioning=Hash([max(revenue0.total_revenue)@0], 4), input_partitions=4 +25)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +26)--------------CoalesceBatchesExec: target_batch_size=8192 +27)----------------FilterExec: max(revenue0.total_revenue)@0 IS NOT NULL +28)------------------AggregateExec: mode=Final, gby=[], aggr=[max(revenue0.total_revenue)] +29)--------------------CoalescePartitionsExec +30)----------------------AggregateExec: mode=Partial, gby=[], aggr=[max(revenue0.total_revenue)] +31)------------------------ProjectionExec: expr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@1 as total_revenue] +32)--------------------------AggregateExec: mode=FinalPartitioned, gby=[l_suppkey@0 as l_suppkey], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] +33)----------------------------CoalesceBatchesExec: target_batch_size=8192 +34)------------------------------RepartitionExec: partitioning=Hash([l_suppkey@0], 4), input_partitions=4 +35)--------------------------------AggregateExec: 
mode=Partial, gby=[l_suppkey@0 as l_suppkey], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] +36)----------------------------------CoalesceBatchesExec: target_batch_size=8192 +37)------------------------------------FilterExec: l_shipdate@3 >= 1996-01-01 AND l_shipdate@3 < 1996-04-01, projection=[l_suppkey@0, l_extendedprice@1, l_discount@2] +38)--------------------------------------CsvExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_suppkey, l_extendedprice, l_discount, l_shipdate], has_header=false query ITTTR with revenue0 (supplier_no, total_revenue) as ( From 54b344e188cfc5acd804d067087dfae6100359bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 7 Sep 2024 21:38:21 +0200 Subject: [PATCH 21/22] Wip --- .../sqllogictest/test_files/tpch/q2.slt.part | 79 ++++++++++--------- 1 file changed, 41 insertions(+), 38 deletions(-) diff --git a/datafusion/sqllogictest/test_files/tpch/q2.slt.part b/datafusion/sqllogictest/test_files/tpch/q2.slt.part index 17f3b78a089d..6496f453fe66 100644 --- a/datafusion/sqllogictest/test_files/tpch/q2.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/q2.slt.part @@ -85,19 +85,20 @@ logical_plan 20)--------------TableScan: region projection=[r_regionkey, r_name], partial_filters=[region.r_name = Utf8("EUROPE")] 21)------SubqueryAlias: __scalar_sq_1 22)--------Projection: min(partsupp.ps_supplycost), partsupp.ps_partkey -23)----------Aggregate: groupBy=[[partsupp.ps_partkey]], aggr=[[min(partsupp.ps_supplycost)]] -24)------------Projection: partsupp.ps_partkey, partsupp.ps_supplycost -25)--------------Inner Join: 
nation.n_regionkey = region.r_regionkey -26)----------------Projection: partsupp.ps_partkey, partsupp.ps_supplycost, nation.n_regionkey -27)------------------Inner Join: supplier.s_nationkey = nation.n_nationkey -28)--------------------Projection: partsupp.ps_partkey, partsupp.ps_supplycost, supplier.s_nationkey -29)----------------------Inner Join: partsupp.ps_suppkey = supplier.s_suppkey -30)------------------------TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_supplycost] -31)------------------------TableScan: supplier projection=[s_suppkey, s_nationkey] -32)--------------------TableScan: nation projection=[n_nationkey, n_regionkey] -33)----------------Projection: region.r_regionkey -34)------------------Filter: region.r_name = Utf8("EUROPE") -35)--------------------TableScan: region projection=[r_regionkey, r_name], partial_filters=[region.r_name = Utf8("EUROPE")] +23)----------Filter: min(partsupp.ps_supplycost) IS NOT NULL +24)------------Aggregate: groupBy=[[partsupp.ps_partkey]], aggr=[[min(partsupp.ps_supplycost)]] +25)--------------Projection: partsupp.ps_partkey, partsupp.ps_supplycost +26)----------------Inner Join: nation.n_regionkey = region.r_regionkey +27)------------------Projection: partsupp.ps_partkey, partsupp.ps_supplycost, nation.n_regionkey +28)--------------------Inner Join: supplier.s_nationkey = nation.n_nationkey +29)----------------------Projection: partsupp.ps_partkey, partsupp.ps_supplycost, supplier.s_nationkey +30)------------------------Inner Join: partsupp.ps_suppkey = supplier.s_suppkey +31)--------------------------TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_supplycost] +32)--------------------------TableScan: supplier projection=[s_suppkey, s_nationkey] +33)----------------------TableScan: nation projection=[n_nationkey, n_regionkey] +34)------------------Projection: region.r_regionkey +35)--------------------Filter: region.r_name = Utf8("EUROPE") +36)----------------------TableScan: region 
projection=[r_regionkey, r_name], partial_filters=[region.r_name = Utf8("EUROPE")] physical_plan 01)SortPreservingMergeExec: [s_acctbal@0 DESC,n_name@2 ASC NULLS LAST,s_name@1 ASC NULLS LAST,p_partkey@3 ASC NULLS LAST], fetch=10 02)--SortExec: TopK(fetch=10), expr=[s_acctbal@0 DESC,n_name@2 ASC NULLS LAST,s_name@1 ASC NULLS LAST,p_partkey@3 ASC NULLS LAST], preserve_partitioning=[true] @@ -147,37 +148,39 @@ physical_plan 46)----------CoalesceBatchesExec: target_batch_size=8192 47)------------RepartitionExec: partitioning=Hash([ps_partkey@1, min(partsupp.ps_supplycost)@0], 4), input_partitions=4 48)--------------ProjectionExec: expr=[min(partsupp.ps_supplycost)@1 as min(partsupp.ps_supplycost), ps_partkey@0 as ps_partkey] -49)----------------AggregateExec: mode=FinalPartitioned, gby=[ps_partkey@0 as ps_partkey], aggr=[min(partsupp.ps_supplycost)] -50)------------------CoalesceBatchesExec: target_batch_size=8192 -51)--------------------RepartitionExec: partitioning=Hash([ps_partkey@0], 4), input_partitions=4 -52)----------------------AggregateExec: mode=Partial, gby=[ps_partkey@0 as ps_partkey], aggr=[min(partsupp.ps_supplycost)] -53)------------------------CoalesceBatchesExec: target_batch_size=8192 -54)--------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_regionkey@2, r_regionkey@0)], projection=[ps_partkey@0, ps_supplycost@1] +49)----------------CoalesceBatchesExec: target_batch_size=8192 +50)------------------FilterExec: min(partsupp.ps_supplycost)@1 IS NOT NULL +51)--------------------AggregateExec: mode=FinalPartitioned, gby=[ps_partkey@0 as ps_partkey], aggr=[min(partsupp.ps_supplycost)] +52)----------------------CoalesceBatchesExec: target_batch_size=8192 +53)------------------------RepartitionExec: partitioning=Hash([ps_partkey@0], 4), input_partitions=4 +54)--------------------------AggregateExec: mode=Partial, gby=[ps_partkey@0 as ps_partkey], aggr=[min(partsupp.ps_supplycost)] 55)----------------------------CoalesceBatchesExec: 
target_batch_size=8192 -56)------------------------------RepartitionExec: partitioning=Hash([n_regionkey@2], 4), input_partitions=4 +56)------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_regionkey@2, r_regionkey@0)], projection=[ps_partkey@0, ps_supplycost@1] 57)--------------------------------CoalesceBatchesExec: target_batch_size=8192 -58)----------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@2, n_nationkey@0)], projection=[ps_partkey@0, ps_supplycost@1, n_regionkey@4] +58)----------------------------------RepartitionExec: partitioning=Hash([n_regionkey@2], 4), input_partitions=4 59)------------------------------------CoalesceBatchesExec: target_batch_size=8192 -60)--------------------------------------RepartitionExec: partitioning=Hash([s_nationkey@2], 4), input_partitions=4 +60)--------------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@2, n_nationkey@0)], projection=[ps_partkey@0, ps_supplycost@1, n_regionkey@4] 61)----------------------------------------CoalesceBatchesExec: target_batch_size=8192 -62)------------------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_suppkey@1, s_suppkey@0)], projection=[ps_partkey@0, ps_supplycost@2, s_nationkey@4] +62)------------------------------------------RepartitionExec: partitioning=Hash([s_nationkey@2], 4), input_partitions=4 63)--------------------------------------------CoalesceBatchesExec: target_batch_size=8192 -64)----------------------------------------------RepartitionExec: partitioning=Hash([ps_suppkey@1], 4), input_partitions=4 -65)------------------------------------------------CsvExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], 
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_partkey, ps_suppkey, ps_supplycost], has_header=false -66)--------------------------------------------CoalesceBatchesExec: target_batch_size=8192 -67)----------------------------------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=4 -68)------------------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -69)--------------------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], has_header=false -70)------------------------------------CoalesceBatchesExec: target_batch_size=8192 -71)--------------------------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4 -72)----------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -73)------------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_regionkey], has_header=false -74)----------------------------CoalesceBatchesExec: target_batch_size=8192 -75)------------------------------RepartitionExec: partitioning=Hash([r_regionkey@0], 4), input_partitions=4 +64)----------------------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_suppkey@1, s_suppkey@0)], projection=[ps_partkey@0, ps_supplycost@2, s_nationkey@4] +65)------------------------------------------------CoalesceBatchesExec: target_batch_size=8192 +66)--------------------------------------------------RepartitionExec: partitioning=Hash([ps_suppkey@1], 4), input_partitions=4 
+67)----------------------------------------------------CsvExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:0..2932049], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:2932049..5864098], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:5864098..8796147], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/partsupp.tbl:8796147..11728193]]}, projection=[ps_partkey, ps_suppkey, ps_supplycost], has_header=false +68)------------------------------------------------CoalesceBatchesExec: target_batch_size=8192 +69)--------------------------------------------------RepartitionExec: partitioning=Hash([s_suppkey@0], 4), input_partitions=4 +70)----------------------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +71)------------------------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], has_header=false +72)----------------------------------------CoalesceBatchesExec: target_batch_size=8192 +73)------------------------------------------RepartitionExec: partitioning=Hash([n_nationkey@0], 4), input_partitions=4 +74)--------------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +75)----------------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_regionkey], has_header=false 76)--------------------------------CoalesceBatchesExec: target_batch_size=8192 -77)----------------------------------FilterExec: r_name@1 = EUROPE, projection=[r_regionkey@0] -78)------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -79)--------------------------------------CsvExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/region.tbl]]}, projection=[r_regionkey, r_name], has_header=false +77)----------------------------------RepartitionExec: partitioning=Hash([r_regionkey@0], 4), input_partitions=4 +78)------------------------------------CoalesceBatchesExec: target_batch_size=8192 +79)--------------------------------------FilterExec: r_name@1 = EUROPE, projection=[r_regionkey@0] +80)----------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +81)------------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/region.tbl]]}, projection=[r_regionkey, r_name], has_header=false From f56293c65ac5e284bf3a0c42ac8741579ece5aa3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Mon, 9 Sep 2024 20:35:17 +0200 Subject: [PATCH 22/22] Wip bench --- datafusion/core/benches/sql_query_with_io.rs | 40 +++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/datafusion/core/benches/sql_query_with_io.rs b/datafusion/core/benches/sql_query_with_io.rs index aef39a04e47e..e6eb95001374 100644 --- a/datafusion/core/benches/sql_query_with_io.rs +++ b/datafusion/core/benches/sql_query_with_io.rs @@ -42,7 +42,7 @@ use rand::{rngs::StdRng, Rng, SeedableRng}; use tokio::runtime::Runtime; use url::Url; -const THREADS: usize = 4; +const THREADS: usize = 10; const TABLES: usize = 3; const TABLE_PARTITIONS: usize = 10; const PARTITION_FILES: usize = 2; @@ -58,6 +58,7 @@ fn schema() -> SchemaRef { Arc::new(Schema::new(vec![ Field::new("id", DataType::UInt64, false), Field::new("payload", DataType::Int64, false), + Field::new("optional_id", DataType::UInt64, true), ])) } @@ -65,15 +66,23 @@ fn create_parquet_file(rng: &mut StdRng, id_offset: usize) -> Bytes { let schema = schema(); let mut id_builder = UInt64Builder::new(); let mut payload_builder = Int64Builder::new(); + let mut 
optional_id_builder = UInt64Builder::new(); + for row in 0..FILE_ROWS { id_builder.append_value((row + id_offset) as u64); payload_builder.append_value(rng.gen()); + if row % 2 == 0 { + optional_id_builder.append_null(); + } else { + optional_id_builder.append_value((row + id_offset) as u64); + } } let batch = RecordBatch::try_new( Arc::clone(&schema), vec![ Arc::new(id_builder.finish()), Arc::new(payload_builder.finish()), + Arc::new(optional_id_builder.finish()), ], ) .unwrap(); @@ -256,6 +265,35 @@ fn criterion_benchmark(c: &mut Criterion) { &format!("{join_query} WHERE {table0_name}.partition = 0"), PARTITION_FILES * FILE_ROWS, ); + + let mut join_query = "SELECT * FROM".to_owned(); + for table_id in 0..TABLES { + let table_name = table_name(table_id); + if table_id == 0 { + write!(join_query, " {table_name}").unwrap(); + } else { + write!( + join_query, + " INNER JOIN {table_name} on {table_name}.optional_id = {table0_name}.id AND {table_name}.partition = {table0_name}.partition", + ).unwrap(); + } + } + bench_query( + c, + &ctx, + &rt, + "IO: INNER JOIN (nullable), all tables, all partitions", + &join_query, + TABLE_PARTITIONS * PARTITION_FILES * FILE_ROWS / 2, + ); + bench_query( + c, + &ctx, + &rt, + "IO: INNER JOIN (nullable), all tables, single partition", + &format!("{join_query} WHERE {table0_name}.partition = 0"), + PARTITION_FILES * FILE_ROWS / 2, + ); } criterion_group!(benches, criterion_benchmark);