Skip to content
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 72 additions & 2 deletions datafusion/physical-optimizer/src/coalesce_batches.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ use datafusion_common::config::ConfigOptions;
use datafusion_common::error::Result;
use datafusion_physical_expr::Partitioning;
use datafusion_physical_plan::{
coalesce_batches::CoalesceBatchesExec, filter::FilterExec, joins::HashJoinExec,
repartition::RepartitionExec, ExecutionPlan,
aggregates::{AggregateExec, AggregateMode}, coalesce_batches::CoalesceBatchesExec, filter::FilterExec,
joins::HashJoinExec, repartition::RepartitionExec, ExecutionPlan,
};

use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
Expand Down Expand Up @@ -92,3 +92,73 @@ impl PhysicalOptimizerRule for CoalesceBatches {
true
}
}

/// Physical optimizer rule that removes `CoalesceBatchesExec` nodes placed
/// directly in front of an `AggregateExec`.
///
/// The rule also strips a `CoalesceBatchesExec` found on the left input of a
/// `HashJoinExec`.
#[derive(Debug, Default)]
pub struct UnCoalesceBatches {}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am wondering if we instead can avoid adding them in the CoalesceBatches optimizer

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Certainly something to attempt. I've not done it (yet) because it's not necessary to evaluate the impact of this change


impl UnCoalesceBatches {
    /// Creates a new `UnCoalesceBatches` rule.
    ///
    /// The rule carries no state, so this is equivalent to `Default::default()`.
    pub fn new() -> Self {
        Self {}
    }
}

impl PhysicalOptimizerRule for UnCoalesceBatches {
    /// Rewrites `plan` bottom-up, dropping `CoalesceBatchesExec` nodes in two
    /// positions:
    ///
    /// * directly below an `AggregateExec` whose mode is not
    ///   `AggregateMode::Partial`;
    /// * on the left input of a `HashJoinExec`.
    ///
    /// The plan is returned untouched when `execution.coalesce_batches` is
    /// disabled in `config`.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `with_new_children` while rebuilding
    /// a parent node around its new input.
    fn optimize(
        &self,
        plan: Arc<dyn ExecutionPlan>,
        config: &ConfigOptions,
    ) -> Result<Arc<dyn ExecutionPlan>> {
        if !config.execution.coalesce_batches {
            return Ok(plan);
        }

        plan.transform_up(|node| {
            if let Some(aggregate) = node.as_any().downcast_ref::<AggregateExec>() {
                if aggregate.mode() != &AggregateMode::Partial {
                    // Look at the aggregate's *input*: `node` itself is an
                    // `AggregateExec` and can never downcast to a
                    // `CoalesceBatchesExec`.
                    if let Some(coalesce) = aggregate
                        .input()
                        .as_any()
                        .downcast_ref::<CoalesceBatchesExec>()
                    {
                        // Rebuild the aggregate (not the coalesce node) so it
                        // reads straight from the coalesce's input.
                        return Ok(Transformed::yes(
                            Arc::clone(&node)
                                .with_new_children(vec![Arc::clone(coalesce.input())])?,
                        ));
                    }
                }
            }

            if let Some(hash_join) = node.as_any().downcast_ref::<HashJoinExec>() {
                if let Some(coalesce) = hash_join
                    .left()
                    .as_any()
                    .downcast_ref::<CoalesceBatchesExec>()
                {
                    // Only the left input is unwrapped; the right input is
                    // passed through unchanged.
                    return Ok(Transformed::yes(Arc::clone(&node).with_new_children(
                        vec![
                            Arc::clone(coalesce.input()),
                            Arc::clone(hash_join.right()),
                        ],
                    )?));
                }
            }

            Ok(Transformed::no(node))
        })
        .data()
    }

    /// Name under which this rule is reported.
    fn name(&self) -> &str {
        "uncoalesce_batches"
    }

    /// Returns `true`, requesting the optimizer's schema check for this rule.
    fn schema_check(&self) -> bool {
        true
    }
}
5 changes: 3 additions & 2 deletions datafusion/physical-optimizer/src/optimizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ use std::fmt::Debug;
use std::sync::Arc;

use crate::aggregate_statistics::AggregateStatistics;
use crate::coalesce_batches::CoalesceBatches;
use crate::coalesce_batches::{CoalesceBatches, UnCoalesceBatches};
use crate::combine_partial_final_agg::CombinePartialFinalAggregate;
use crate::enforce_distribution::EnforceDistribution;
use crate::enforce_sorting::EnforceSorting;
Expand Down Expand Up @@ -110,9 +110,10 @@ impl PhysicalOptimizer {
Arc::new(OptimizeAggregateOrder::new()),
// TODO: `try_embed_to_hash_join` in the ProjectionPushdown rule would be blocked by CoalesceBatches, so run it before CoalesceBatches. This may be optimized in the future.
Arc::new(ProjectionPushdown::new()),
// The CoalesceBatches rule will not influence the distribution and ordering of the
// The CoalesceBatches/UnCoalesceBatches rule will not influence the distribution and ordering of the
// whole plan tree. Therefore, to avoid influencing other rules, it should run last.
Arc::new(CoalesceBatches::new()),
Arc::new(UnCoalesceBatches::new()),
// Remove the ancillary output requirement operator since we are done with the planning
// phase.
Arc::new(OutputRequirements::new_remove_mode()),
Expand Down
Loading