Add LimitPushdown optimization rule and CoalesceBatchesExec fetch #27

Closed
wants to merge 51 commits
Changes from 34 commits
Commits (51)
d868d60
Add LimitPushdown skeleton
alihandroid Jul 10, 2024
36f0ba9
Transform StreamTableExec into fetching version when skip is 0
alihandroid Jul 11, 2024
532fe69
Transform StreamTableExec into fetching version when skip is non-zero
alihandroid Jul 11, 2024
8dcd86b
Fix non-zero skip test
alihandroid Jul 11, 2024
26636f1
Add fetch field to CoalesceBatchesExec
alihandroid Jul 13, 2024
145da2d
Tag ProjectionExec, CoalescePartitionsExec and SortPreservingMergeExe…
alihandroid Jul 13, 2024
1d0d2c8
Add `with_fetch` to SortExec
alihandroid Jul 13, 2024
d4eab68
Push limit down through supporting ExecutionPlans
alihandroid Jul 13, 2024
ff1609f
Reorder LimitPushdown optimization to before SanityCheckPlan
alihandroid Jul 13, 2024
3a8989f
Refactor LimitPushdown tests
alihandroid Jul 15, 2024
10d6436
Refactor LimitPushdown tests
alihandroid Jul 15, 2024
26c907a
Add more LimitPushdown tests
alihandroid Jul 15, 2024
1f3299a
Add fetch support to CoalesceBatchesExec
alihandroid Jul 15, 2024
49efeb7
Fix tests that were affected
alihandroid Jul 15, 2024
68a315c
Refactor LimitPushdown push_down_limits
alihandroid Jul 15, 2024
7960c0a
Remove unnecessary parameter from coalesce_batches_exec
alihandroid Jul 15, 2024
5294cd2
Format files
alihandroid Jul 17, 2024
2bb7385
Apply clippy fixes
alihandroid Jul 17, 2024
54d8713
Make CoalesceBatchesExec display consistent
alihandroid Jul 17, 2024
db48495
Fix slt tests according to LimitPushdown rules
alihandroid Jul 17, 2024
f57326c
Resolve linter errors
mustafasrepo Jul 19, 2024
faaa4b5
Minor changes
mustafasrepo Jul 19, 2024
12a8d44
Merge branch 'apache_main' into alihan_apache_main
mustafasrepo Jul 19, 2024
c9c0845
Minor changes
mustafasrepo Jul 19, 2024
0fb546f
Fix GlobalLimitExec sometimes replacing LocalLimitExec
alihandroid Jul 21, 2024
79b0ba9
Fix unnecessary LocalLimitExec for ProjectionExec
alihandroid Jul 21, 2024
c2984d1
Rename GlobalOrLocal into LimitExec
alihandroid Jul 21, 2024
62c1f10
Clarify pushdown recursion
alihandroid Jul 21, 2024
5509bcf
Minor changes
mustafasrepo Jul 22, 2024
082d25e
Minor
berkaysynnada Jul 22, 2024
8b514fa
Do not display when fetch is None
berkaysynnada Jul 22, 2024
fb625c9
.rs removal
berkaysynnada Jul 22, 2024
bf75757
Clean-up tpch plans
berkaysynnada Jul 22, 2024
cb9806f
Clean-up comments
berkaysynnada Jul 22, 2024
887e75c
Update datafusion/core/src/physical_optimizer/optimizer.rs
ozankabak Jul 22, 2024
24282aa
Update datafusion/physical-plan/src/coalesce_batches.rs
ozankabak Jul 22, 2024
eb1b4c8
Update datafusion/physical-plan/src/coalesce_batches.rs
ozankabak Jul 22, 2024
33f9ece
Update datafusion/physical-plan/src/coalesce_batches.rs
ozankabak Jul 22, 2024
46621d5
Update datafusion/core/src/physical_optimizer/limit_pushdown.rs
ozankabak Jul 22, 2024
26a8d3c
Update datafusion/core/src/physical_optimizer/limit_pushdown.rs
ozankabak Jul 22, 2024
d57a3a6
Update datafusion/physical-plan/src/lib.rs
ozankabak Jul 22, 2024
f888826
Implement with_fetch() for other source execs
berkaysynnada Jul 22, 2024
a58d7c2
Merge branch 'tmp' into apache_main
berkaysynnada Jul 22, 2024
a096ea4
Minor
berkaysynnada Jul 22, 2024
0f7aa8d
Merge all Global/Local-LimitExec combinations in LimitPushdown
alihandroid Jul 25, 2024
dd9ad7a
Merge remote-tracking branch 'upstream/main' into apache_main
alihandroid Jul 25, 2024
f9becb6
Fix compile errors after merge
alihandroid Jul 25, 2024
0d01e76
Merge remote-tracking branch 'upstream/main' into apache_main
alihandroid Jul 26, 2024
a4a7794
Update datafusion/core/src/physical_optimizer/limit_pushdown.rs
ozankabak Jul 26, 2024
c4c7276
Avoid code duplication
ozankabak Jul 26, 2024
dcd69b6
Incorporate review feedback
ozankabak Jul 27, 2024
566 changes: 566 additions & 0 deletions datafusion/core/src/physical_optimizer/limit_pushdown.rs

Large diffs are not rendered by default.
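The 566-line rule itself is not shown in this view. Purely as orientation — a hypothetical sketch, not the PR's actual code — the per-node decision such a rule makes can be expressed with the `with_fetch` and `supports_limit_pushdown` hooks introduced later in this diff; the function name and the single-step (non-recursive) structure are assumptions:

```rust
use std::sync::Arc;

use datafusion::error::Result;
use datafusion::physical_plan::limit::{GlobalLimitExec, LocalLimitExec};
use datafusion::physical_plan::ExecutionPlan;

/// Hypothetical sketch: push the fetch of a `GlobalLimitExec` one step into
/// its child. The real rule applies this kind of logic recursively over the
/// whole plan tree.
fn push_limit_one_level(
    limit: &GlobalLimitExec,
) -> Result<Option<Arc<dyn ExecutionPlan>>> {
    let Some(fetch) = limit.fetch() else {
        return Ok(None); // nothing to push down
    };
    let skip = limit.skip();
    let child = Arc::clone(limit.input());

    // Case 1: the child can absorb the limit itself (e.g. SortExec, or
    // CoalesceBatchesExec after this PR): swap in a fetching version and
    // keep the limit on top only while a non-zero skip remains.
    if let Some(fetching_child) = child.with_fetch(Some(skip + fetch)) {
        let new_plan: Arc<dyn ExecutionPlan> = if skip == 0 {
            fetching_child
        } else {
            Arc::new(GlobalLimitExec::new(fetching_child, skip, Some(fetch)))
        };
        return Ok(Some(new_plan));
    }

    // Case 2: the child is transparent to limits (e.g. ProjectionExec or
    // CoalescePartitionsExec): insert a LocalLimitExec below it so upstream
    // operators stop producing rows early (assumes a single-child node).
    if child.supports_limit_pushdown() {
        let grandchild = child.children()[0].clone();
        let limited: Arc<dyn ExecutionPlan> =
            Arc::new(LocalLimitExec::new(grandchild, skip + fetch));
        let new_child = child.with_new_children(vec![limited])?;
        return Ok(Some(Arc::new(GlobalLimitExec::new(
            new_child,
            skip,
            Some(fetch),
        ))));
    }

    Ok(None) // the limit stays where it is
}
```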

1 change: 1 addition & 0 deletions datafusion/core/src/physical_optimizer/mod.rs
@@ -27,6 +27,7 @@ pub mod combine_partial_final_agg;
pub mod enforce_distribution;
pub mod enforce_sorting;
pub mod join_selection;
pub mod limit_pushdown;
pub mod limited_distinct_aggregation;
pub mod optimizer;
pub mod output_requirements;
5 changes: 5 additions & 0 deletions datafusion/core/src/physical_optimizer/optimizer.rs
@@ -28,6 +28,7 @@ use crate::physical_optimizer::combine_partial_final_agg::CombinePartialFinalAgg
use crate::physical_optimizer::enforce_distribution::EnforceDistribution;
use crate::physical_optimizer::enforce_sorting::EnforceSorting;
use crate::physical_optimizer::join_selection::JoinSelection;
use crate::physical_optimizer::limit_pushdown::LimitPushdown;
use crate::physical_optimizer::limited_distinct_aggregation::LimitedDistinctAggregation;
use crate::physical_optimizer::output_requirements::OutputRequirements;
use crate::physical_optimizer::sanity_checker::SanityCheckPlan;
@@ -98,6 +99,10 @@ impl PhysicalOptimizer {
// are not present, the load of executors such as join or union will be
// reduced by narrowing their input tables.
Arc::new(ProjectionPushdown::new()),
// The LimitPushdown rule tries to push limits down as far as possible,
// replacing ExecutionPlans with fetching versions or adding limits
// past ExecutionPlans that support limit pushdown.
Arc::new(LimitPushdown::new()),
// The SanityCheckPlan rule checks whether the order and
// distribution requirements of each node in the plan
// is satisfied. It will also reject non-runnable query
23 changes: 12 additions & 11 deletions datafusion/core/tests/memory_limit/mod.rs
@@ -222,17 +222,18 @@ async fn sort_preserving_merge() {
// SortPreservingMergeExec (not a Sort which would compete
// with the SortPreservingMergeExec for memory)
&[
"+---------------+-------------------------------------------------------------------------------------------------------------+",
"| plan_type | plan |",
"+---------------+-------------------------------------------------------------------------------------------------------------+",
"| logical_plan | Limit: skip=0, fetch=10 |",
"| | Sort: t.a ASC NULLS LAST, t.b ASC NULLS LAST, fetch=10 |",
"| | TableScan: t projection=[a, b] |",
"| physical_plan | GlobalLimitExec: skip=0, fetch=10 |",
"| | SortPreservingMergeExec: [a@0 ASC NULLS LAST,b@1 ASC NULLS LAST], fetch=10 |",
"| | MemoryExec: partitions=2, partition_sizes=[5, 5], output_ordering=a@0 ASC NULLS LAST,b@1 ASC NULLS LAST |",
"| | |",
"+---------------+-------------------------------------------------------------------------------------------------------------+",
"+---------------+---------------------------------------------------------------------------------------------------------------+",
"| plan_type | plan |",
"+---------------+---------------------------------------------------------------------------------------------------------------+",
"| logical_plan | Limit: skip=0, fetch=10 |",
"| | Sort: t.a ASC NULLS LAST, t.b ASC NULLS LAST, fetch=10 |",
"| | TableScan: t projection=[a, b] |",
"| physical_plan | GlobalLimitExec: skip=0, fetch=10 |",
"| | SortPreservingMergeExec: [a@0 ASC NULLS LAST,b@1 ASC NULLS LAST], fetch=10 |",
"| | LocalLimitExec: fetch=10 |",
Review comment: Another way to get this plan would be to support limit pushdown into MemoryExec (rather than applying the limit with a new LocalLimitExec).

"| | MemoryExec: partitions=2, partition_sizes=[5, 5], output_ordering=a@0 ASC NULLS LAST,b@1 ASC NULLS LAST |",
"| | |",
"+---------------+---------------------------------------------------------------------------------------------------------------+",
]
)
.run()
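The review comment above suggests letting `MemoryExec` absorb the limit instead of adding a `LocalLimitExec`. As an illustration only (not part of this PR; a real implementation would be a `with_fetch` method on `MemoryExec`, and `memory_exec_with_fetch` is an invented name), the per-partition trimming involved might look like this, using only existing `MemoryExec` and `RecordBatch` APIs:

```rust
use datafusion::arrow::datatypes::SchemaRef;
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::error::Result;
use datafusion::physical_plan::memory::MemoryExec;

/// Illustrative helper: build a `MemoryExec` whose partitions are already
/// trimmed to at most `fetch` rows each, which is roughly the work a
/// `with_fetch` implementation on `MemoryExec` would do.
fn memory_exec_with_fetch(
    partitions: &[Vec<RecordBatch>],
    schema: SchemaRef,
    fetch: usize,
) -> Result<MemoryExec> {
    let limited: Vec<Vec<RecordBatch>> = partitions
        .iter()
        .map(|batches| {
            let mut remaining = fetch;
            let mut out = Vec::new();
            for batch in batches {
                if remaining == 0 {
                    break;
                }
                // keep only as many rows of this batch as the limit allows
                let take = remaining.min(batch.num_rows());
                out.push(batch.slice(0, take));
                remaining -= take;
            }
            out
        })
        .collect();
    MemoryExec::try_new(&limited, schema, None)
}
```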
164 changes: 155 additions & 9 deletions datafusion/physical-plan/src/coalesce_batches.rs
@@ -46,6 +46,9 @@ pub struct CoalesceBatchesExec {
input: Arc<dyn ExecutionPlan>,
/// Minimum number of rows for coalesces batches
target_batch_size: usize,
/// Maximum number of rows to fetch,
/// `None` means fetching all rows
fetch: Option<usize>,
/// Execution metrics
metrics: ExecutionPlanMetricsSet,
cache: PlanProperties,
@@ -58,11 +61,18 @@ impl CoalesceBatchesExec {
Self {
input,
target_batch_size,
fetch: None,
metrics: ExecutionPlanMetricsSet::new(),
cache,
}
}

/// Update fetch with the argument
pub fn with_fetch(mut self, fetch: Option<usize>) -> Self {
self.fetch = fetch;
self
}

/// The input plan
pub fn input(&self) -> &Arc<dyn ExecutionPlan> {
&self.input
Expand Down Expand Up @@ -96,8 +106,13 @@ impl DisplayAs for CoalesceBatchesExec {
write!(
f,
"CoalesceBatchesExec: target_batch_size={}",
self.target_batch_size
)
self.target_batch_size,
)?;
if let Some(fetch) = self.fetch {
write!(f, ", fetch={fetch}")?;
};

Ok(())
}
}
}
Expand Down Expand Up @@ -133,10 +148,10 @@ impl ExecutionPlan for CoalesceBatchesExec {
self: Arc<Self>,
children: Vec<Arc<dyn ExecutionPlan>>,
) -> Result<Arc<dyn ExecutionPlan>> {
Ok(Arc::new(CoalesceBatchesExec::new(
Arc::clone(&children[0]),
self.target_batch_size,
)))
Ok(Arc::new(
CoalesceBatchesExec::new(Arc::clone(&children[0]), self.target_batch_size)
.with_fetch(self.fetch),
))
}

fn execute(
@@ -148,8 +163,10 @@ impl ExecutionPlan for CoalesceBatchesExec {
input: self.input.execute(partition, context)?,
schema: self.input.schema(),
target_batch_size: self.target_batch_size,
fetch: self.fetch,
buffer: Vec::new(),
buffered_rows: 0,
total_rows: 0,
is_closed: false,
baseline_metrics: BaselineMetrics::new(&self.metrics, partition),
}))
@@ -162,6 +179,16 @@ impl ExecutionPlan for CoalesceBatchesExec {
fn statistics(&self) -> Result<Statistics> {
self.input.statistics()
}

fn with_fetch(&self, limit: Option<usize>) -> Option<Arc<dyn ExecutionPlan>> {
Some(Arc::new(CoalesceBatchesExec {
input: Arc::clone(&self.input),
target_batch_size: self.target_batch_size,
fetch: limit,
metrics: self.metrics.clone(),
cache: self.cache.clone(),
}))
}
}

struct CoalesceBatchesStream {
@@ -171,10 +198,14 @@ struct CoalesceBatchesStream {
schema: SchemaRef,
/// Minimum number of rows for coalesces batches
target_batch_size: usize,
/// Maximum number of rows to fetch,
/// `None` means fetching all rows
fetch: Option<usize>,
/// Buffered batches
buffer: Vec<RecordBatch>,
/// Buffered row count
buffered_rows: usize,
/// Total number of rows returned
total_rows: usize,
/// Whether the stream has finished returning all of its data or not
is_closed: bool,
/// Execution metrics
@@ -216,6 +247,39 @@
match input_batch {
Poll::Ready(x) => match x {
Some(Ok(batch)) => {
// handle fetch limit
if let Some(fetch) = self.fetch {
if self.total_rows + batch.num_rows() >= fetch {
// we have reached the fetch limit
let remaining_rows = fetch - self.total_rows;
// Shouldn't be empty
debug_assert!(remaining_rows > 0);

self.is_closed = true;
self.total_rows = fetch;
// trim the batch
let batch = batch.slice(0, remaining_rows);
// add to the buffered batches
self.buffered_rows += batch.num_rows();
self.buffer.push(batch);
// combine the batches and return
let batch = concat_batches(
&self.schema,
&self.buffer,
self.buffered_rows,
)?;
// reset buffer state
self.buffer.clear();
self.buffered_rows = 0;
// return batch
return Poll::Ready(Some(Ok(batch)));
} else {
self.total_rows += batch.num_rows();
}
} else {
self.total_rows += batch.num_rows();
}

if batch.num_rows() >= self.target_batch_size
&& self.buffer.is_empty()
{
Expand Down Expand Up @@ -304,7 +368,7 @@ mod tests {
let partition = create_vec_batches(&schema, 10);
let partitions = vec![partition];

let output_partitions = coalesce_batches(&schema, partitions, 21).await?;
let output_partitions = coalesce_batches(&schema, partitions, 21, None).await?;
assert_eq!(1, output_partitions.len());

// input is 10 batches x 8 rows (80 rows)
@@ -319,6 +383,86 @@ mod tests {
Ok(())
}

#[tokio::test]
Review comment: 👍

async fn test_concat_batches_with_fetch_larger_than_input_size() -> Result<()> {
let schema = test_schema();
let partition = create_vec_batches(&schema, 10);
let partitions = vec![partition];

let output_partitions =
coalesce_batches(&schema, partitions, 21, Some(100)).await?;
assert_eq!(1, output_partitions.len());

// input is 10 batches x 8 rows (80 rows) with fetch limit of 100
// expected to behave the same as `test_concat_batches`
let batches = &output_partitions[0];
assert_eq!(4, batches.len());
assert_eq!(24, batches[0].num_rows());
assert_eq!(24, batches[1].num_rows());
assert_eq!(24, batches[2].num_rows());
assert_eq!(8, batches[3].num_rows());

Ok(())
}

#[tokio::test]
async fn test_concat_batches_with_fetch_less_than_input_size() -> Result<()> {
let schema = test_schema();
let partition = create_vec_batches(&schema, 10);
let partitions = vec![partition];

let output_partitions =
coalesce_batches(&schema, partitions, 21, Some(50)).await?;
assert_eq!(1, output_partitions.len());

// input is 10 batches x 8 rows (80 rows) with fetch limit of 50
let batches = &output_partitions[0];
assert_eq!(3, batches.len());
assert_eq!(24, batches[0].num_rows());
assert_eq!(24, batches[1].num_rows());
assert_eq!(2, batches[2].num_rows());

Ok(())
}

#[tokio::test]
async fn test_concat_batches_with_fetch_less_than_target_and_no_remaining_rows(
) -> Result<()> {
let schema = test_schema();
let partition = create_vec_batches(&schema, 10);
let partitions = vec![partition];

let output_partitions =
coalesce_batches(&schema, partitions, 21, Some(48)).await?;
assert_eq!(1, output_partitions.len());

// input is 10 batches x 8 rows (80 rows) with fetch limit of 48
let batches = &output_partitions[0];
assert_eq!(2, batches.len());
assert_eq!(24, batches[0].num_rows());
assert_eq!(24, batches[1].num_rows());

Ok(())
}

#[tokio::test]
async fn test_concat_batches_with_fetch_less_target_batch_size() -> Result<()> {
let schema = test_schema();
let partition = create_vec_batches(&schema, 10);
let partitions = vec![partition];

let output_partitions =
coalesce_batches(&schema, partitions, 21, Some(10)).await?;
assert_eq!(1, output_partitions.len());

// input is 10 batches x 8 rows (80 rows) with fetch limit of 10
let batches = &output_partitions[0];
assert_eq!(1, batches.len());
assert_eq!(10, batches[0].num_rows());

Ok(())
}

Review comment (Collaborator):
We have 4 quantities with variable-length batch sizes:
1) Total input row count
2) Single batch input row count
3) Target batch size
4) Fetch count
We have tested four different cases by ordering them differently. Can you extend these cases?

Reply (Author):
Can you rephrase your comment? I only changed the fetch count for the tests, do you mean that I should parametrize the other inputs as well?

Reply (Collaborator):
Yes, please. For example, your tests cover:

  1. Total input row count > target batch size > fetch count > single batch row count
  2. Total input row count > fetch count > target batch size > single batch row count
  3. Fetch count > total input row count > target batch size > single batch row count

Can you extend these with the other possible scenarios?
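One ordering not yet covered is a fetch smaller than a single input batch (single batch row count > fetch count). A hypothetical extra test in the spirit of the request, reusing the helpers already in this file, could look like the following; the expected counts follow the slicing logic above but have not been verified here:

```rust
#[tokio::test]
async fn test_concat_batches_with_fetch_less_than_single_batch_size() -> Result<()> {
    let schema = test_schema();
    let partition = create_vec_batches(&schema, 10);
    let partitions = vec![partition];

    // fetch (5) is smaller than one 8-row input batch, so the very first
    // input batch should be sliced and returned on its own
    let output_partitions = coalesce_batches(&schema, partitions, 21, Some(5)).await?;
    assert_eq!(1, output_partitions.len());

    let batches = &output_partitions[0];
    assert_eq!(1, batches.len());
    assert_eq!(5, batches[0].num_rows());

    Ok(())
}
```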

fn test_schema() -> Arc<Schema> {
Arc::new(Schema::new(vec![Field::new("c0", DataType::UInt32, false)]))
}
@@ -327,13 +471,15 @@ mod tests {
schema: &SchemaRef,
input_partitions: Vec<Vec<RecordBatch>>,
target_batch_size: usize,
fetch: Option<usize>,
) -> Result<Vec<Vec<RecordBatch>>> {
// create physical plan
let exec = MemoryExec::try_new(&input_partitions, Arc::clone(schema), None)?;
let exec =
RepartitionExec::try_new(Arc::new(exec), Partitioning::RoundRobinBatch(1))?;
let exec: Arc<dyn ExecutionPlan> =
Arc::new(CoalesceBatchesExec::new(Arc::new(exec), target_batch_size));
let exec: Arc<dyn ExecutionPlan> = Arc::new(
CoalesceBatchesExec::new(Arc::new(exec), target_batch_size).with_fetch(fetch),
);

// execute and collect results
let output_partition_count = exec.output_partitioning().partition_count();
4 changes: 4 additions & 0 deletions datafusion/physical-plan/src/coalesce_partitions.rs
@@ -174,6 +174,10 @@ impl ExecutionPlan for CoalescePartitionsExec {
fn statistics(&self) -> Result<Statistics> {
self.input.statistics()
}

fn supports_limit_pushdown(&self) -> bool {
true
}
}

#[cfg(test)]
12 changes: 12 additions & 0 deletions datafusion/physical-plan/src/lib.rs
@@ -428,6 +428,18 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync {
fn statistics(&self) -> Result<Statistics> {
Ok(Statistics::new_unknown(&self.schema()))
}

/// Returns `true` if a limit can be safely pushed down through this
/// `ExecutionPlan` node.
fn supports_limit_pushdown(&self) -> bool {
false
}

/// Returns a fetching version of this `ExecutionPlan` node, if it supports
/// fetch limits. Returns `None` otherwise.
fn with_fetch(&self, _limit: Option<usize>) -> Option<Arc<dyn ExecutionPlan>> {
None
}
}

/// Extension trait provides an easy API to fetch various properties of
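Seen from a caller, the two new hooks compose naturally: let a node absorb the limit if it can, otherwise wrap it. A minimal hypothetical helper (`apply_fetch` is an illustrative name, not part of this PR) showing the pattern:

```rust
use std::sync::Arc;

use datafusion::physical_plan::limit::LocalLimitExec;
use datafusion::physical_plan::ExecutionPlan;

/// Hypothetical helper: prefer the node's own fetching variant via the new
/// `with_fetch` hook, and fall back to wrapping it in a `LocalLimitExec`.
fn apply_fetch(plan: Arc<dyn ExecutionPlan>, fetch: usize) -> Arc<dyn ExecutionPlan> {
    match plan.with_fetch(Some(fetch)) {
        // e.g. SortExec, or CoalesceBatchesExec after this PR
        Some(fetching_plan) => fetching_plan,
        // the node has no way to stop early by itself
        None => Arc::new(LocalLimitExec::new(plan, fetch)),
    }
}
```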
4 changes: 4 additions & 0 deletions datafusion/physical-plan/src/projection.rs
@@ -245,6 +245,10 @@ impl ExecutionPlan for ProjectionExec {
Arc::clone(&self.schema),
))
}

fn supports_limit_pushdown(&self) -> bool {
true
}
}

/// If e is a direct column reference, returns the field level
11 changes: 11 additions & 0 deletions datafusion/physical-plan/src/sorts/sort.rs
@@ -926,6 +926,17 @@ impl ExecutionPlan for SortExec {
fn statistics(&self) -> Result<Statistics> {
self.input.statistics()
}

fn with_fetch(&self, limit: Option<usize>) -> Option<Arc<dyn ExecutionPlan>> {
Some(Arc::new(SortExec {
input: Arc::clone(&self.input),
expr: self.expr.clone(),
metrics_set: self.metrics_set.clone(),
preserve_partitioning: self.preserve_partitioning,
fetch: limit,
cache: self.cache.clone(),
}))
}
}

#[cfg(test)]
4 changes: 4 additions & 0 deletions datafusion/physical-plan/src/sorts/sort_preserving_merge.rs
@@ -264,6 +264,10 @@ impl ExecutionPlan for SortPreservingMergeExec {
fn statistics(&self) -> Result<Statistics> {
self.input.statistics()
}

fn supports_limit_pushdown(&self) -> bool {
true
}
}

#[cfg(test)]