apache · andygrove · Dec 21, 2020 · Dec 21, 2020 · Dec 21, 2020 · Dec 21, 2020
diff --git a/rust/benchmarks/src/bin/tpch.rs b/rust/benchmarks/src/bin/tpch.rs
@@ -18,7 +18,6 @@
 //! Benchmark derived from TPC-H. This is not an official TPC-H benchmark.
 
 use std::path::{Path, PathBuf};
-use std::sync::Arc;
 use std::time::Instant;
 
 use arrow::datatypes::{DataType, DateUnit, Field, Schema};
@@ -28,7 +27,6 @@ use datafusion::datasource::{CsvFile, MemTable, TableProvider};
 use datafusion::error::{DataFusionError, Result};
 use datafusion::logical_plan::LogicalPlan;
 use datafusion::physical_plan::collect;
-use datafusion::physical_plan::csv::CsvExec;
 use datafusion::prelude::*;
 
 use parquet::basic::Compression;
@@ -87,6 +85,10 @@ struct ConvertOpt {
     /// Compression to use when writing Parquet files
     #[structopt(short = "c", long = "compression", default_value = "snappy")]
     compression: String,
+
+    /// Number of partitions to produce
+    #[structopt(short = "p", long = "partitions", default_value = "1")]
+    partitions: usize,
 }
 
 #[derive(Debug, StructOpt)]
@@ -1017,8 +1019,21 @@ async fn convert_tbl(opt: ConvertOpt) -> Result<()> {
             .delimiter(b'|')
             .file_extension(".tbl");
 
-        let ctx = ExecutionContext::new();
-        let csv = Arc::new(CsvExec::try_new(&input_path, options, None, 4096)?);
+        let mut ctx = ExecutionContext::new();
+
+        // build plan to read the TBL file
+        let mut csv = ctx.read_csv(&input_path, options)?;
+
+        // optionally, repartition the file
+        if opt.partitions > 1 {
+            csv = csv.repartition(Partitioning::RoundRobinBatch(opt.partitions))?
+        }
+
+        // create the physical plan
+        let csv = csv.to_logical_plan();
+        let csv = ctx.optimize(&csv)?;
+        let csv = ctx.create_physical_plan(&csv)?;
+
         let output_path = output_root_path.join(table);
         let output_path = output_path.to_str().unwrap().to_owned();
 

diff --git a/rust/datafusion/src/dataframe.rs b/rust/datafusion/src/dataframe.rs
@@ -19,7 +19,9 @@
 
 use crate::arrow::record_batch::RecordBatch;
 use crate::error::Result;
-use crate::logical_plan::{DFSchema, Expr, FunctionRegistry, JoinType, LogicalPlan};
+use crate::logical_plan::{
+    DFSchema, Expr, FunctionRegistry, JoinType, LogicalPlan, Partitioning,
+};
 use std::sync::Arc;
 
 use async_trait::async_trait;
@@ -172,6 +174,23 @@ pub trait DataFrame {
         right_cols: &[&str],
     ) -> Result<Arc<dyn DataFrame>>;
 
+    /// Repartition a DataFrame based on a logical partitioning scheme.
+    ///
+    /// ```
+    /// # use datafusion::prelude::*;
+    /// # use datafusion::error::Result;
+    /// # fn main() -> Result<()> {
+    /// let mut ctx = ExecutionContext::new();
+    /// let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?;
+    /// let df1 = df.repartition(Partitioning::RoundRobinBatch(4))?;
+    /// # Ok(())
+    /// # }
+    /// ```
+    fn repartition(
+        &self,
+        partitioning_scheme: Partitioning,
+    ) -> Result<Arc<dyn DataFrame>>;
+
     /// Executes this DataFrame and collects all results into a vector of RecordBatch.
     ///
     /// ```

diff --git a/rust/datafusion/src/execution/dataframe_impl.rs b/rust/datafusion/src/execution/dataframe_impl.rs
@@ -24,6 +24,7 @@ use crate::error::Result;
 use crate::execution::context::{ExecutionContext, ExecutionContextState};
 use crate::logical_plan::{
     col, DFSchema, Expr, FunctionRegistry, JoinType, LogicalPlan, LogicalPlanBuilder,
+    Partitioning,
 };
 use crate::{arrow::record_batch::RecordBatch, physical_plan::collect};
 
@@ -111,6 +112,16 @@ impl DataFrame for DataFrameImpl {
         Ok(Arc::new(DataFrameImpl::new(self.ctx_state.clone(), &plan)))
     }
 
+    fn repartition(
+        &self,
+        partitioning_scheme: Partitioning,
+    ) -> Result<Arc<dyn DataFrame>> {
+        let plan = LogicalPlanBuilder::from(&self.plan)
+            .repartition(partitioning_scheme)?
+            .build()?;
+        Ok(Arc::new(DataFrameImpl::new(self.ctx_state.clone(), &plan)))
+    }
+
     /// Convert to logical plan
     fn to_logical_plan(&self) -> LogicalPlan {
         self.plan.clone()

diff --git a/rust/datafusion/src/logical_plan/builder.rs b/rust/datafusion/src/logical_plan/builder.rs
@@ -35,7 +35,7 @@ use super::dfschema::ToDFSchema;
 use super::{
     col, exprlist_to_fields, Expr, JoinType, LogicalPlan, PlanType, StringifiedPlan,
 };
-use crate::logical_plan::{DFField, DFSchema, DFSchemaRef};
+use crate::logical_plan::{DFField, DFSchema, DFSchemaRef, Partitioning};
 use std::collections::HashSet;
 
 /// Builder for logical plans
@@ -207,6 +207,14 @@ impl LogicalPlanBuilder {
         }
     }
 
+    /// Wrap the current plan in a `Repartition` node that uses the given partitioning scheme
+    pub fn repartition(&self, partitioning_scheme: Partitioning) -> Result<Self> {
+        Ok(Self::from(&LogicalPlan::Repartition {
+            input: Arc::new(self.plan.clone()),
+            partitioning_scheme,
+        }))
+    }
+
     /// Apply an aggregate
     pub fn aggregate(&self, group_expr: Vec<Expr>, aggr_expr: Vec<Expr>) -> Result<Self> {
         let mut all_expr: Vec<Expr> = group_expr.clone();

diff --git a/rust/datafusion/src/logical_plan/mod.rs b/rust/datafusion/src/logical_plan/mod.rs
@@ -41,5 +41,7 @@ pub use expr::{
 };
 pub use extension::UserDefinedLogicalNode;
 pub use operators::Operator;
-pub use plan::{JoinType, LogicalPlan, PlanType, PlanVisitor, StringifiedPlan};
+pub use plan::{
+    JoinType, LogicalPlan, Partitioning, PlanType, PlanVisitor, StringifiedPlan,
+};
 pub use registry::FunctionRegistry;
diff --git a/rust/datafusion/src/logical_plan/plan.rs b/rust/datafusion/src/logical_plan/plan.rs
@@ -110,6 +110,13 @@ pub enum LogicalPlan {
         /// The output schema, containing fields from the left and right inputs
         schema: DFSchemaRef,
     },
+    /// Repartition the plan based on a partitioning scheme.
+    Repartition {
+        /// The incoming logical plan
+        input: Arc<LogicalPlan>,
+        /// The partitioning scheme
+        partitioning_scheme: Partitioning,
+    },
     /// Produces rows from a table provider by reference or from the context
     TableScan {
         /// The name of the table
@@ -182,6 +189,7 @@ impl LogicalPlan {
             LogicalPlan::Aggregate { schema, .. } => &schema,
             LogicalPlan::Sort { input, .. } => input.schema(),
             LogicalPlan::Join { schema, .. } => &schema,
+            LogicalPlan::Repartition { input, .. } => input.schema(),
             LogicalPlan::Limit { input, .. } => input.schema(),
             LogicalPlan::CreateExternalTable { schema, .. } => &schema,
             LogicalPlan::Explain { schema, .. } => &schema,
@@ -198,6 +206,17 @@ impl LogicalPlan {
     }
 }
 
+/// Logical partitioning schemes supported by the repartition operator.
+#[derive(Debug, Clone)]
+pub enum Partitioning {
+    /// Allocate batches using a round-robin algorithm and the specified number of partitions
+    RoundRobinBatch(usize),
+    /// Allocate rows based on a hash of one or more expressions and the specified number
+    /// of partitions.
+    /// This partitioning scheme is not yet fully supported. See <https://issues.apache.org/jira/browse/ARROW-11011>
+    Hash(Vec<Expr>, usize),
+}
+
 /// Trait that implements the [Visitor
 /// pattern](https://en.wikipedia.org/wiki/Visitor_pattern) for a
 /// depth first walk of `LogicalPlan` nodes. `pre_visit` is called
@@ -261,6 +280,7 @@ impl LogicalPlan {
         let recurse = match self {
             LogicalPlan::Projection { input, .. } => input.accept(visitor)?,
             LogicalPlan::Filter { input, .. } => input.accept(visitor)?,
+            LogicalPlan::Repartition { input, .. } => input.accept(visitor)?,
             LogicalPlan::Aggregate { input, .. } => input.accept(visitor)?,
             LogicalPlan::Sort { input, .. } => input.accept(visitor)?,
             LogicalPlan::Join { left, right, .. } => {
@@ -464,7 +484,7 @@ impl LogicalPlan {
         struct Wrapper<'a>(&'a LogicalPlan);
         impl<'a> fmt::Display for Wrapper<'a> {
             fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-                match *self.0 {
+                match &*self.0 {
                     LogicalPlan::EmptyRelation { .. } => write!(f, "EmptyRelation"),
                     LogicalPlan::TableScan {
                         ref table_name,
@@ -523,6 +543,28 @@ impl LogicalPlan {
                             keys.iter().map(|(l, r)| format!("{} = {}", l, r)).collect();
                         write!(f, "Join: {}", join_expr.join(", "))
                     }
+                    LogicalPlan::Repartition {
+                        partitioning_scheme,
+                        ..
+                    } => match partitioning_scheme {
+                        Partitioning::RoundRobinBatch(n) => {
+                            write!(
+                                f,
+                                "Repartition: RoundRobinBatch partition_count={}",
+                                n
+                            )
+                        }
+                        Partitioning::Hash(expr, n) => {
+                            let hash_expr: Vec<String> =
+                                expr.iter().map(|e| format!("{:?}", e)).collect();
+                            write!(
+                                f,
+                                "Repartition: Hash({}) partition_count={}",
+                                hash_expr.join(", "),
+                                n
+                            )
+                        }
+                    },
                     LogicalPlan::Limit { ref n, .. } => write!(f, "Limit: {}", n),
                     LogicalPlan::CreateExternalTable { ref name, .. } => {
                         write!(f, "CreateExternalTable: {:?}", name)

diff --git a/rust/datafusion/src/optimizer/hash_build_probe_order.rs b/rust/datafusion/src/optimizer/hash_build_probe_order.rs
@@ -117,6 +117,7 @@ impl OptimizerRule for HashBuildProbeOrder {
             | LogicalPlan::TableScan { .. }
             | LogicalPlan::Limit { .. }
             | LogicalPlan::Filter { .. }
+            | LogicalPlan::Repartition { .. }
             | LogicalPlan::EmptyRelation { .. }
             | LogicalPlan::Sort { .. }
             | LogicalPlan::CreateExternalTable { .. }

diff --git a/rust/datafusion/src/optimizer/projection_push_down.rs b/rust/datafusion/src/optimizer/projection_push_down.rs
@@ -275,6 +275,7 @@ fn optimize_plan(
         // expressions in this node to the list of required columns
         LogicalPlan::Limit { .. }
         | LogicalPlan::Filter { .. }
+        | LogicalPlan::Repartition { .. }
         | LogicalPlan::EmptyRelation { .. }
         | LogicalPlan::Sort { .. }
         | LogicalPlan::CreateExternalTable { .. }

diff --git a/rust/datafusion/src/optimizer/utils.rs b/rust/datafusion/src/optimizer/utils.rs
@@ -24,7 +24,7 @@ use arrow::datatypes::Schema;
 use super::optimizer::OptimizerRule;
 use crate::error::{DataFusionError, Result};
 use crate::logical_plan::{
-    Expr, LogicalPlan, Operator, PlanType, StringifiedPlan, ToDFSchema,
+    Expr, LogicalPlan, Operator, Partitioning, PlanType, StringifiedPlan, ToDFSchema,
 };
 use crate::prelude::{col, lit};
 use crate::scalar::ScalarValue;
@@ -140,6 +140,13 @@ pub fn expressions(plan: &LogicalPlan) -> Vec<Expr> {
     match plan {
         LogicalPlan::Projection { expr, .. } => expr.clone(),
         LogicalPlan::Filter { predicate, .. } => vec![predicate.clone()],
+        LogicalPlan::Repartition {
+            partitioning_scheme,
+            ..
+        } => match partitioning_scheme {
+            Partitioning::Hash(expr, _) => expr.clone(),
+            _ => vec![],
+        },
         LogicalPlan::Aggregate {
             group_expr,
             aggr_expr,
@@ -168,6 +175,7 @@ pub fn inputs(plan: &LogicalPlan) -> Vec<&LogicalPlan> {
     match plan {
         LogicalPlan::Projection { input, .. } => vec![input],
         LogicalPlan::Filter { input, .. } => vec![input],
+        LogicalPlan::Repartition { input, .. } => vec![input],
         LogicalPlan::Aggregate { input, .. } => vec![input],
         LogicalPlan::Sort { input, .. } => vec![input],
         LogicalPlan::Join { left, right, .. } => vec![left, right],
@@ -197,6 +205,19 @@ pub fn from_plan(
             predicate: expr[0].clone(),
             input: Arc::new(inputs[0].clone()),
         }),
+        LogicalPlan::Repartition {
+            partitioning_scheme,
+            ..
+        } => match partitioning_scheme {
+            Partitioning::RoundRobinBatch(n) => Ok(LogicalPlan::Repartition {
+                partitioning_scheme: Partitioning::RoundRobinBatch(*n),
+                input: Arc::new(inputs[0].clone()),
+            }),
+            Partitioning::Hash(_, n) => Ok(LogicalPlan::Repartition {
+                partitioning_scheme: Partitioning::Hash(expr.to_owned(), *n),
+                input: Arc::new(inputs[0].clone()),
+            }),
+        },
         LogicalPlan::Aggregate {
             group_expr, schema, ..
         } => Ok(LogicalPlan::Aggregate {

diff --git a/rust/datafusion/src/physical_plan/mod.rs b/rust/datafusion/src/physical_plan/mod.rs
@@ -107,7 +107,13 @@ pub async fn collect(plan: Arc<dyn ExecutionPlan>) -> Result<Vec<RecordBatch>> {
 /// Partitioning schemes supported by operators.
 #[derive(Debug, Clone)]
 pub enum Partitioning {
-    /// Unknown partitioning scheme
+    /// Allocate batches using a round-robin algorithm and the specified number of partitions
+    RoundRobinBatch(usize),
+    /// Allocate rows based on a hash of one or more expressions and the specified
+    /// number of partitions.
+    /// This partitioning scheme is not yet fully supported. See <https://issues.apache.org/jira/browse/ARROW-11011>
+    Hash(Vec<Arc<dyn PhysicalExpr>>, usize),
+    /// Unknown partitioning scheme with a known number of partitions
     UnknownPartitioning(usize),
 }
 
@@ -116,6 +122,8 @@ impl Partitioning {
     pub fn partition_count(&self) -> usize {
         use Partitioning::*;
         match self {
+            RoundRobinBatch(n) => *n,
+            Hash(_, n) => *n,
             UnknownPartitioning(n) => *n,
         }
     }
@@ -260,6 +268,7 @@ pub mod merge;
 pub mod parquet;
 pub mod planner;
 pub mod projection;
+pub mod repartition;
 pub mod sort;
 pub mod string_expressions;
 pub mod type_coercion;

diff --git a/rust/datafusion/src/physical_plan/planner.rs b/rust/datafusion/src/physical_plan/planner.rs
@@ -23,21 +23,22 @@ use super::{aggregates, empty::EmptyExec, expressions::binary, functions, udaf};
 use crate::error::{DataFusionError, Result};
 use crate::execution::context::ExecutionContextState;
 use crate::logical_plan::{
-    DFSchema, Expr, LogicalPlan, Operator, PlanType, StringifiedPlan,
-    UserDefinedLogicalNode,
+    DFSchema, Expr, LogicalPlan, Operator, Partitioning as LogicalPartitioning, PlanType,
+    StringifiedPlan, UserDefinedLogicalNode,
 };
 use crate::physical_plan::explain::ExplainExec;
 use crate::physical_plan::expressions::{CaseExpr, Column, Literal, PhysicalSortExpr};
 use crate::physical_plan::filter::FilterExec;
 use crate::physical_plan::hash_aggregate::{AggregateMode, HashAggregateExec};
 use crate::physical_plan::hash_join::HashJoinExec;
-use crate::physical_plan::hash_utils;
 use crate::physical_plan::limit::{GlobalLimitExec, LocalLimitExec};
 use crate::physical_plan::merge::MergeExec;
 use crate::physical_plan::projection::ProjectionExec;
+use crate::physical_plan::repartition::RepartitionExec;
 use crate::physical_plan::sort::SortExec;
 use crate::physical_plan::udf;
 use crate::physical_plan::{expressions, Distribution};
+use crate::physical_plan::{hash_utils, Partitioning};
 use crate::physical_plan::{AggregateExpr, ExecutionPlan, PhysicalExpr, PhysicalPlanner};
 use crate::prelude::JoinType;
 use crate::variable::VarType;
@@ -228,6 +229,31 @@ impl DefaultPhysicalPlanner {
                     self.create_physical_expr(predicate, &input_schema, ctx_state)?;
                 Ok(Arc::new(FilterExec::try_new(runtime_expr, input)?))
             }
+            LogicalPlan::Repartition {
+                input,
+                partitioning_scheme,
+            } => {
+                let input = self.create_physical_plan(input, ctx_state)?;
+                let input_schema = input.schema();
+                let physical_partitioning = match partitioning_scheme {
+                    LogicalPartitioning::RoundRobinBatch(n) => {
+                        Partitioning::RoundRobinBatch(*n)
+                    }
+                    LogicalPartitioning::Hash(expr, n) => {
+                        let runtime_expr = expr
+                            .iter()
+                            .map(|e| {
+                                self.create_physical_expr(e, &input_schema, &ctx_state)
+                            })
+                            .collect::<Result<Vec<_>>>()?;
+                        Partitioning::Hash(runtime_expr, *n)
+                    }
+                };
+                Ok(Arc::new(RepartitionExec::try_new(
+                    input,
+                    physical_partitioning,
+                )?))
+            }
             LogicalPlan::Sort { expr, input, .. } => {
                 let input = self.create_physical_plan(input, ctx_state)?;
                 let input_schema = input.as_ref().schema();