Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[PERF] Add physical plan optimizer and optimization #2557

Merged
merged 13 commits into from
Jul 31, 2024
4 changes: 4 additions & 0 deletions daft/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,7 @@ def set_execution_config(
broadcast_join_size_bytes_threshold: int | None = None,
parquet_split_row_groups_max_files: int | None = None,
sort_merge_join_sort_with_aligned_boundaries: bool | None = None,
hash_join_partition_size_leniency: float | None = None,
sample_size_for_sort: int | None = None,
num_preview_rows: int | None = None,
parquet_target_filesize: int | None = None,
Expand Down Expand Up @@ -305,6 +306,9 @@ def set_execution_config(
sort_merge_join_sort_with_aligned_boundaries: Whether to use a specialized algorithm for sorting both sides of a
sort-merge join such that they have aligned boundaries. This can lead to a faster merge-join at the cost of
more skewed sorted join inputs, increasing the risk of OOMs.
hash_join_partition_size_leniency: If the left side of a hash join is already correctly partitioned and the right side isn't,
and the ratio between the left and right size is at least this value, then the right side is repartitioned to have an equal
number of partitions as the left. Defaults to 0.5.
sample_size_for_sort: number of elements to sample from each partition when running sort,
Default is 20.
num_preview_rows: number of rows to show when displaying a dataframe preview,
Expand Down
3 changes: 3 additions & 0 deletions daft/daft.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -1674,6 +1674,7 @@ class PyDaftExecutionConfig:
broadcast_join_size_bytes_threshold: int | None = None,
parquet_split_row_groups_max_files: int | None = None,
sort_merge_join_sort_with_aligned_boundaries: bool | None = None,
hash_join_partition_size_leniency: float | None = None,
sample_size_for_sort: int | None = None,
num_preview_rows: int | None = None,
parquet_target_filesize: int | None = None,
Expand All @@ -1695,6 +1696,8 @@ class PyDaftExecutionConfig:
@property
def sort_merge_join_sort_with_aligned_boundaries(self) -> bool: ...
@property
def hash_join_partition_size_leniency(self) -> float: ...
@property
def sample_size_for_sort(self) -> int: ...
@property
def num_preview_rows(self) -> int: ...
Expand Down
2 changes: 2 additions & 0 deletions src/common/daft-config/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ pub struct DaftExecutionConfig {
pub scan_tasks_max_size_bytes: usize,
pub broadcast_join_size_bytes_threshold: usize,
pub sort_merge_join_sort_with_aligned_boundaries: bool,
pub hash_join_partition_size_leniency: f64,
pub sample_size_for_sort: usize,
pub parquet_split_row_groups_max_files: usize,
pub num_preview_rows: usize,
Expand All @@ -51,6 +52,7 @@ impl Default for DaftExecutionConfig {
scan_tasks_max_size_bytes: 384 * 1024 * 1024, // 384MB
broadcast_join_size_bytes_threshold: 10 * 1024 * 1024, // 10 MiB
sort_merge_join_sort_with_aligned_boundaries: false,
hash_join_partition_size_leniency: 0.5,
sample_size_for_sort: 20,
parquet_split_row_groups_max_files: 10,
num_preview_rows: 8,
Expand Down
9 changes: 9 additions & 0 deletions src/common/daft-config/src/python.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ impl PyDaftExecutionConfig {
broadcast_join_size_bytes_threshold: Option<usize>,
parquet_split_row_groups_max_files: Option<usize>,
sort_merge_join_sort_with_aligned_boundaries: Option<bool>,
hash_join_partition_size_leniency: Option<f64>,
sample_size_for_sort: Option<usize>,
num_preview_rows: Option<usize>,
parquet_target_filesize: Option<usize>,
Expand Down Expand Up @@ -122,6 +123,9 @@ impl PyDaftExecutionConfig {
config.sort_merge_join_sort_with_aligned_boundaries =
sort_merge_join_sort_with_aligned_boundaries;
}
if let Some(hash_join_partition_size_leniency) = hash_join_partition_size_leniency {
config.hash_join_partition_size_leniency = hash_join_partition_size_leniency;
}
if let Some(sample_size_for_sort) = sample_size_for_sort {
config.sample_size_for_sort = sample_size_for_sort;
}
Expand Down Expand Up @@ -183,6 +187,11 @@ impl PyDaftExecutionConfig {
Ok(self.config.sort_merge_join_sort_with_aligned_boundaries)
}

#[getter]
fn get_hash_join_partition_size_leniency(&self) -> PyResult<f64> {
Ok(self.config.hash_join_partition_size_leniency)
}

#[getter]
fn get_sample_size_for_sort(&self) -> PyResult<usize> {
Ok(self.config.sample_size_for_sort)
Expand Down
9 changes: 9 additions & 0 deletions src/daft-dsl/src/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use daft_core::{
schema::Schema,
utils::supertype::try_get_supertype,
};
use itertools::Itertools;

use crate::{
functions::{
Expand Down Expand Up @@ -1253,6 +1254,14 @@ pub fn resolve_aggexprs(
itertools::process_results(resolved_iter, |res| res.unzip())
}

/// Returns true if `a` and `b` reference the same multiset of column names,
/// i.e. one partitioning is merely a reordering of the other.
pub fn is_partition_compatible(a: &[ExprRef], b: &[ExprRef]) -> bool {
    // Differing lengths can never be a reordering of one another.
    if a.len() != b.len() {
        return false;
    }
    // Compare the name multisets by sorting both sides in place.
    let mut left: Vec<&str> = a.iter().map(|e| e.name()).collect();
    let mut right: Vec<&str> = b.iter().map(|e| e.name()).collect();
    left.sort_unstable();
    right.sort_unstable();
    left == right
}

#[cfg(test)]
mod tests {

Expand Down
4 changes: 3 additions & 1 deletion src/daft-dsl/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ mod treenode;
pub use common_treenode;
pub use expr::binary_op;
pub use expr::col;
pub use expr::{resolve_aggexpr, resolve_aggexprs, resolve_expr, resolve_exprs};
pub use expr::{
is_partition_compatible, resolve_aggexpr, resolve_aggexprs, resolve_expr, resolve_exprs,
};
pub use expr::{AggExpr, ApproxPercentileParams, Expr, ExprRef, Operator};
pub use lit::{lit, null_lit, Literal, LiteralValue};
#[cfg(feature = "python")]
Expand Down
1 change: 1 addition & 0 deletions src/daft-plan/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ mod logical_optimization;
mod logical_plan;
mod partitioning;
pub mod physical_ops;
mod physical_optimization;
mod physical_plan;
mod physical_planner;
mod resource_request;
Expand Down
17 changes: 16 additions & 1 deletion src/daft-plan/src/physical_ops/project.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,28 @@ pub struct Project {
}

impl Project {
/// Builds a `Project` node, deriving the output clustering spec by
/// translating the input's clustering spec through the projection.
pub(crate) fn try_new(
    input: PhysicalPlanRef,
    projection: Vec<ExprRef>,
    resource_request: ResourceRequest,
) -> DaftResult<Self> {
    Ok(Self {
        // Compute the derived spec before `input`/`projection` are moved
        // into the struct below.
        clustering_spec: translate_clustering_spec(input.clustering_spec(), &projection),
        input,
        projection,
        resource_request,
    })
}

// does not re-create clustering spec, unlike try_new
pub(crate) fn new_with_clustering_spec(
input: PhysicalPlanRef,
projection: Vec<ExprRef>,
resource_request: ResourceRequest,
clustering_spec: Arc<ClusteringSpec>,
) -> DaftResult<Self> {
let clustering_spec = translate_clustering_spec(clustering_spec, &projection);
Ok(Self {
input,
projection,
Expand Down
3 changes: 3 additions & 0 deletions src/daft-plan/src/physical_optimization/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
pub mod optimizer;
mod plan_context;
mod rules;
Loading
Loading