Patched DF 46.0.1 #64

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed

crepererum wants to merge 4 commits into base-df-upgrade-ver4601 from upgrade-df-ver4601-a

datafusion/common/src/config.rs

-Original file line number
+Diff line change
@@ Expand Up / @@ -314,7 +314,7 @@ config_namespace! { @@
             ///
             /// This is used to workaround bugs in the planner that are now caught by
             /// the new schema verification step.
-            pub skip_physical_aggregate_schema_check: bool, default = false
+            pub skip_physical_aggregate_schema_check: bool, default = true
             /// Specifies the reserved memory for each spillable sort operation to
             /// facilitate an in-memory merge.
@@ Expand Down @@

datafusion/core/src/physical_planner.rs

-Original file line number
+Diff line change
@@ Expand Up / @@ -688,6 +688,9 @@ impl DefaultPhysicalPlanner { @@
                                 differences.push(format!("field nullability at index {} [{}]: (physical) {} vs (logical) {}", i, physical_field.name(), physical_field.is_nullable(), logical_field.is_nullable()));
                             }
                         }
+                        log::warn!("Physical input schema should be the same as the one converted from logical input schema, but did not match for logical plan:\n{}", input.display_indent());
                         return internal_err!("Physical input schema should be the same as the one converted from logical input schema. Differences: {}", differences
                             .iter()
                             .map(|s| format!("\n\t- {}", s))
@@ Expand Down @@

datafusion/core/tests/physical_optimizer/enforce_distribution.rs

Large diffs are not rendered by default.

datafusion/core/tests/physical_optimizer/enforce_sorting.rs

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -17,12 +17,15 @@
  
    use std::sync::Arc;

    use crate::physical_optimizer::enforce_distribution::{

        parquet_exec_with_stats, projection_exec_with_alias,

    };

    use crate::physical_optimizer::test_utils::{

        aggregate_exec, bounded_window_exec, check_integrity, coalesce_batches_exec,

        coalesce_partitions_exec, create_test_schema, create_test_schema2,

        create_test_schema3, filter_exec, global_limit_exec, hash_join_exec, limit_exec,

        local_limit_exec, memory_exec, parquet_exec, repartition_exec, sort_exec, sort_expr,

        sort_expr_options, sort_merge_join_exec, sort_preserving_merge_exec,

        local_limit_exec, memory_exec, parquet_exec, repartition_exec, schema, sort_exec,

        sort_expr, sort_expr_options, sort_merge_join_exec, sort_preserving_merge_exec,

        sort_preserving_merge_exec_with_fetch, spr_repartition_exec, stream_exec_ordered,

        union_exec, RequirementsTestExec,

    };

    @@ -38,6 +41,8 @@ use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
  
    use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};

    use datafusion_physical_expr::expressions::{col, Column, NotExpr};

    use datafusion_physical_expr::Partitioning;

    use datafusion_physical_optimizer::sanity_checker::SanityCheckPlan;

    use datafusion_physical_plan::aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy};

    use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec;

    use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec};

    use datafusion_physical_plan::repartition::RepartitionExec;

    @@ -1909,6 +1914,92 @@ async fn test_commutativity() -> Result<()> {
  
        Ok(())

    }

    /// Wraps `input` in an [`AggregateExec`] running in
    /// `AggregateMode::SinglePartitioned` with no aggregate or filter expressions.
    ///
    /// Each `(column, alias)` entry in `alias_pairs` becomes one group-by
    /// expression: `column` is resolved against `input`'s schema and exposed
    /// under `alias`.
    ///
    /// # Panics
    ///
    /// Panics if a column cannot be resolved or if `AggregateExec::try_new`
    /// returns an error.
    fn single_partition_aggregate(
        input: Arc<dyn ExecutionPlan>,
        alias_pairs: Vec<(String, String)>,
    ) -> Arc<dyn ExecutionPlan> {
        let input_schema = input.schema();
        let mut group_exprs = Vec::with_capacity(alias_pairs.len());
        for (column, alias) in &alias_pairs {
            let expr = col(column, &input_schema).unwrap();
            group_exprs.push((expr, alias.to_string()));
        }

        // NOTE(review): the last argument is the shared test `schema()`, not
        // `input.schema()` — confirm this is intended for projected inputs.
        let aggregate = AggregateExec::try_new(
            AggregateMode::SinglePartitioned,
            PhysicalGroupBy::new_single(group_exprs),
            vec![],
            vec![],
            input,
            schema(),
        )
        .unwrap();

        Arc::new(aggregate)
    }

    /// Checks that `EnforceSorting` keeps a `CoalescePartitionsExec` that feeds a
    /// `SinglePartitioned` aggregate, and that the plan passes `SanityCheckPlan`
    /// both before and after the rule runs.
    #[tokio::test]
    async fn test_preserve_needed_coalesce() -> Result<()> {
        // Build the input bottom-up: two parquet scans -> union -> projection -> coalesce.
        let projected = projection_exec_with_alias(
            union_exec(vec![parquet_exec_with_stats(); 2]),
            vec![
                ("a".to_string(), "a".to_string()),
                ("b".to_string(), "value".to_string()),
            ],
        );
        let coalesced = Arc::new(CoalescePartitionsExec::new(projected));

        let test_schema = schema();
        let ordering = LexOrdering::new(vec![PhysicalSortExpr {
            expr: col("a", &test_schema).unwrap(),
            options: SortOptions::default(),
        }]);

        // Aggregate on top of the coalesce, then sort the aggregate output.
        let aggregated: Arc<dyn ExecutionPlan> =
            single_partition_aggregate(coalesced, vec![("a".to_string(), "a1".to_string())]);
        let initial = sort_exec(ordering, aggregated);

        // The plan handed to EnforceSorting.
        let expected_initial = vec![
            "SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
            "  AggregateExec: mode=SinglePartitioned, gby=[a@0 as a1], aggr=[]",
            "    CoalescePartitionsExec",
            "      ProjectionExec: expr=[a@0 as a, b@1 as value]",
            "        UnionExec",
            "          ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]",
            "          ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]",
        ];
        assert_eq!(get_plan_string(&initial), expected_initial);

        // The starting plan must already be sane.
        let initial_check =
            SanityCheckPlan::new().optimize(Arc::clone(&initial), &Default::default());
        assert!(initial_check.is_ok());

        // After EnforceSorting the coalesce below the aggregate is still present,
        // and a SortPreservingMergeExec sits at the root of the plan.
        let optimized = EnforceSorting::new().optimize(initial, &Default::default())?;
        let expected_optimized = vec![
            "SortPreservingMergeExec: [a@0 ASC]",
            "  SortExec: expr=[a@0 ASC], preserve_partitioning=[false]",
            "    AggregateExec: mode=SinglePartitioned, gby=[a@0 as a1], aggr=[]",
            "      CoalescePartitionsExec",
            "        ProjectionExec: expr=[a@0 as a, b@1 as value]",
            "          UnionExec",
            "            ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]",
            "            ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]",
        ];
        assert_eq!(get_plan_string(&optimized), expected_optimized);

        // The optimized plan must still pass the sanity checker.
        let optimized_check = SanityCheckPlan::new().optimize(optimized, &Default::default());
        assert!(optimized_check.is_ok());

        Ok(())
    }

    #[tokio::test]

    async fn test_coalesce_propagate() -> Result<()> {

        let schema = create_test_schema()?;

datafusion/physical-expr/src/equivalence/properties/union.rs

-Original file line number
+Diff line change
@@ Expand Up / @@ -76,17 +76,38 @@ fn calculate_union_binary( @@
             })
             .collect::<Vec<_>>();
+        // TEMP HACK WORKAROUND
+        // Revert code from https://github.com/apache/datafusion/pull/12562
+        // Context: https://github.com/apache/datafusion/issues/13748
+        // Context: https://github.com/influxdata/influxdb_iox/issues/13038
         // Next, calculate valid orderings for the union by searching for prefixes
         // in both sides.
-        let mut orderings = UnionEquivalentOrderingBuilder::new();
-        orderings.add_satisfied_orderings(lhs.normalized_oeq_class(), lhs.constants(), &rhs);
-        orderings.add_satisfied_orderings(rhs.normalized_oeq_class(), rhs.constants(), &lhs);
-        let orderings = orderings.build();
-        let mut eq_properties =
-            EquivalenceProperties::new(lhs.schema).with_constants(constants);
+        let mut orderings = vec![];
+        for mut ordering in lhs.normalized_oeq_class().into_iter() {
+            // Progressively shorten the ordering to search for a satisfied prefix:
+            while !rhs.ordering_satisfy(&ordering) {
+                ordering.pop();
+            }
+            // There is a non-trivial satisfied prefix, add it as a valid ordering:
+            if !ordering.is_empty() {
+                orderings.push(ordering);
+            }
+        }
+        for mut ordering in rhs.normalized_oeq_class().into_iter() {
+            // Progressively shorten the ordering to search for a satisfied prefix:
+            while !lhs.ordering_satisfy(&ordering) {
+                ordering.pop();
+            }
+            // There is a non-trivial satisfied prefix, add it as a valid ordering:
+            if !ordering.is_empty() {
+                orderings.push(ordering);
+            }
+        }
+        let mut eq_properties = EquivalenceProperties::new(lhs.schema);
+        eq_properties.constants = constants;
         eq_properties.add_new_orderings(orderings);
         Ok(eq_properties)
     }
@@ Expand Down Expand Up / @@ -132,6 +153,7 @@ struct UnionEquivalentOrderingBuilder { @@
         orderings: Vec<LexOrdering>,
     }
+    #[expect(unused)]
     impl UnionEquivalentOrderingBuilder {
         fn new() -> Self {
             Self { orderings: vec![] }
@@ Expand Down @@

datafusion/physical-optimizer/src/enforce_sorting/mod.rs

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -47,8 +47,8 @@ use crate::enforce_sorting::sort_pushdown::{
  
        assign_initial_requirements, pushdown_sorts, SortPushDown,

    };

    use crate::utils::{

        add_sort_above, add_sort_above_with_check, is_coalesce_partitions, is_limit,

        is_repartition, is_sort, is_sort_preserving_merge, is_union, is_window,

        add_sort_above, add_sort_above_with_check, is_aggregation, is_coalesce_partitions,

        is_limit, is_repartition, is_sort, is_sort_preserving_merge, is_union, is_window,

    };

    use crate::PhysicalOptimizerRule;

    @@ -624,7 +624,7 @@ fn remove_bottleneck_in_subplan(
  
    ) -> Result<PlanWithCorrespondingCoalescePartitions> {

        let plan = &requirements.plan;

        let children = &mut requirements.children;

        if is_coalesce_partitions(&children[0].plan) {

        if is_coalesce_partitions(&children[0].plan) && !is_aggregation(plan) {

            // We can safely use the 0th index since we have a `CoalescePartitionsExec`.

            let mut new_child_node = children[0].children.swap_remove(0);

            while new_child_node.plan.output_partitioning() == plan.output_partitioning()

datafusion/physical-optimizer/src/sanity_checker.rs

-Original file line number
+Diff line change
@@ Expand Up @@
     use datafusion_physical_expr::intervals::utils::{check_support, is_datatype_supported};
     use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType};
     use datafusion_physical_plan::joins::SymmetricHashJoinExec;
+    use datafusion_physical_plan::sorts::sort::SortExec;
+    use datafusion_physical_plan::union::UnionExec;
     use datafusion_physical_plan::{get_plan_string, ExecutionPlanProperties};
     use crate::PhysicalOptimizerRule;
@@ Expand Down Expand Up / @@ -135,6 +137,14 @@ pub fn check_plan_sanity( @@
             plan.required_input_ordering(),
             plan.required_input_distribution(),
         ) {
+            // TEMP HACK WORKAROUND https://github.com/apache/datafusion/issues/11492
+            if child.as_any().downcast_ref::<UnionExec>().is_some() {
+                continue;
+            }
+            if child.as_any().downcast_ref::<SortExec>().is_some() {
+                continue;
+            }
             let child_eq_props = child.equivalence_properties();
             if let Some(sort_req) = sort_req {
                 if !child_eq_props.ordering_satisfy_requirement(&sort_req) {
@@ Expand Down @@

datafusion/physical-optimizer/src/utils.rs

-Original file line number
+Diff line change
@@ Expand Up / @@ -19,6 +19,7 @@ use std::sync::Arc; @@
     use datafusion_physical_expr::LexRequirement;
     use datafusion_physical_expr_common::sort_expr::LexOrdering;
+    use datafusion_physical_plan::aggregates::AggregateExec;
     use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec;
     use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec};
     use datafusion_physical_plan::repartition::RepartitionExec;
@@ Expand Down Expand Up @@
     pub fn is_limit(plan: &Arc<dyn ExecutionPlan>) -> bool {
         plan.as_any().is::<GlobalLimitExec>() || plan.as_any().is::<LocalLimitExec>()
     }
+    /// Checks whether the given operator is an [`AggregateExec`].
+    pub fn is_aggregation(plan: &Arc<dyn ExecutionPlan>) -> bool {
+        plan.as_any().is::<AggregateExec>()
+    }

datafusion/sqllogictest/test_files/information_schema.slt

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -224,7 +224,7 @@ datafusion.execution.parquet.writer_version 1.0
  
    datafusion.execution.planning_concurrency 13

    datafusion.execution.skip_partial_aggregation_probe_ratio_threshold 0.8

    datafusion.execution.skip_partial_aggregation_probe_rows_threshold 100000

    datafusion.execution.skip_physical_aggregate_schema_check false

    datafusion.execution.skip_physical_aggregate_schema_check true

    datafusion.execution.soft_max_rows_per_output_file 50000000

    datafusion.execution.sort_in_place_threshold_bytes 1048576

    datafusion.execution.sort_spill_reservation_bytes 10485760

    @@ -321,7 +321,7 @@ datafusion.execution.parquet.writer_version 1.0 (writing) Sets parquet writer ve
  
    datafusion.execution.planning_concurrency 13 Fan-out during initial physical planning. This is mostly use to plan `UNION` children in parallel. Defaults to the number of CPU cores on the system

    datafusion.execution.skip_partial_aggregation_probe_ratio_threshold 0.8 Aggregation ratio (number of distinct groups / number of input rows) threshold for skipping partial aggregation. If the value is greater then partial aggregation will skip aggregation for further input

    datafusion.execution.skip_partial_aggregation_probe_rows_threshold 100000 Number of input rows partial aggregation partition should process, before aggregation ratio check and trying to switch to skipping aggregation mode

    datafusion.execution.skip_physical_aggregate_schema_check false When set to true, skips verifying that the schema produced by planning the input of `LogicalPlan::Aggregate` exactly matches the schema of the input plan. When set to false, if the schema does not match exactly (including nullability and metadata), a planning error will be raised. This is used to workaround bugs in the planner that are now caught by the new schema verification step.

    datafusion.execution.skip_physical_aggregate_schema_check true When set to true, skips verifying that the schema produced by planning the input of `LogicalPlan::Aggregate` exactly matches the schema of the input plan. When set to false, if the schema does not match exactly (including nullability and metadata), a planning error will be raised. This is used to workaround bugs in the planner that are now caught by the new schema verification step.

    datafusion.execution.soft_max_rows_per_output_file 50000000 Target number of rows in output files when writing multiple. This is a soft max, so it can be exceeded slightly. There also will be one file smaller than the limit if the total number of rows written is not roughly divisible by the soft max

    datafusion.execution.sort_in_place_threshold_bytes 1048576 When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged.

    datafusion.execution.sort_spill_reservation_bytes 10485760 Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured).

docs/source/user-guide/configs.md

-Original file line number
+Diff line change
@@ Expand Up @@
     | datafusion.execution.parquet.maximum_parallel_row_group_writers         | 1                         | (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame.                                                                                           |
     | datafusion.execution.parquet.maximum_buffered_record_batches_per_stream | 2                         | (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame.                                                                                           |
     | datafusion.execution.planning_concurrency                               | 0                         | Fan-out during initial physical planning. This is mostly use to plan `UNION` children in parallel. Defaults to the number of CPU cores on the system                                                                                                                                                                                                                                                                                                                                                                                                                     |
-    | datafusion.execution.skip_physical_aggregate_schema_check               | false                     | When set to true, skips verifying that the schema produced by planning the input of `LogicalPlan::Aggregate` exactly matches the schema of the input plan. When set to false, if the schema does not match exactly (including nullability and metadata), a planning error will be raised. This is used to workaround bugs in the planner that are now caught by the new schema verification step.                                                                                                                                                                        |
+    | datafusion.execution.skip_physical_aggregate_schema_check               | true                      | When set to true, skips verifying that the schema produced by planning the input of `LogicalPlan::Aggregate` exactly matches the schema of the input plan. When set to false, if the schema does not match exactly (including nullability and metadata), a planning error will be raised. This is used to workaround bugs in the planner that are now caught by the new schema verification step.                                                                                                                                                                        |
     | datafusion.execution.sort_spill_reservation_bytes                       | 10485760                  | Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured).                                                                                                                                        |
     | datafusion.execution.sort_in_place_threshold_bytes                      | 1048576                   | When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged.                                                                                                                                                                                                                                                                                                                                                                                                                                   |
     | datafusion.execution.meta_fetch_concurrency                             | 32                        | Number of files to read in parallel when inferring schema and statistics                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |
@@ Expand Down @@

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Patched DF 46.0.1 #64

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!

Uh oh!