From 7339b59b6326aa6aac81dc8a71cddb2832b6d00f Mon Sep 17 00:00:00 2001 From: Justin O'Dwyer Date: Tue, 16 Dec 2025 21:57:18 -0500 Subject: [PATCH 01/14] Added join testdata and a SQL script to generate parquet from csv. --- testdata/join/csv/dim/data_A0.csv | 4 ++ testdata/join/csv/dim/data_B0.csv | 5 +++ testdata/join/csv/dim/data_C0.csv | 4 ++ testdata/join/csv/dim/data_D0.csv | 6 +++ testdata/join/csv/fact/data_A0.csv | 4 ++ testdata/join/csv/fact/data_B0.csv | 5 +++ testdata/join/csv/fact/data_B1.csv | 6 +++ testdata/join/csv/fact/data_B2.csv | 3 ++ testdata/join/csv/fact/data_C0.csv | 4 ++ testdata/join/csv/fact/data_C1.csv | 4 ++ testdata/join/csv/fact/data_D0.csv | 8 ++++ testdata/join/generate_parquet_from_csv.sql | 48 +++++++++++++++++++++ testdata/join/parquet/dim/data_A0.parquet | 3 ++ testdata/join/parquet/dim/data_B0.parquet | 3 ++ testdata/join/parquet/dim/data_C0.parquet | 3 ++ testdata/join/parquet/dim/data_D0.parquet | 3 ++ testdata/join/parquet/fact/data_A0.parquet | 3 ++ testdata/join/parquet/fact/data_B0.parquet | 3 ++ testdata/join/parquet/fact/data_B1.parquet | 3 ++ testdata/join/parquet/fact/data_B2.parquet | 3 ++ testdata/join/parquet/fact/data_C0.parquet | 3 ++ testdata/join/parquet/fact/data_C1.parquet | 3 ++ testdata/join/parquet/fact/data_D0.parquet | 3 ++ 23 files changed, 134 insertions(+) create mode 100644 testdata/join/csv/dim/data_A0.csv create mode 100644 testdata/join/csv/dim/data_B0.csv create mode 100644 testdata/join/csv/dim/data_C0.csv create mode 100644 testdata/join/csv/dim/data_D0.csv create mode 100644 testdata/join/csv/fact/data_A0.csv create mode 100644 testdata/join/csv/fact/data_B0.csv create mode 100644 testdata/join/csv/fact/data_B1.csv create mode 100644 testdata/join/csv/fact/data_B2.csv create mode 100644 testdata/join/csv/fact/data_C0.csv create mode 100644 testdata/join/csv/fact/data_C1.csv create mode 100644 testdata/join/csv/fact/data_D0.csv create mode 100644 testdata/join/generate_parquet_from_csv.sql create mode 100644 testdata/join/parquet/dim/data_A0.parquet create mode 100644 testdata/join/parquet/dim/data_B0.parquet create mode 100644 testdata/join/parquet/dim/data_C0.parquet create mode 100644 testdata/join/parquet/dim/data_D0.parquet create mode 100644 testdata/join/parquet/fact/data_A0.parquet create mode 100644 testdata/join/parquet/fact/data_B0.parquet create mode 100644 testdata/join/parquet/fact/data_B1.parquet create mode 100644 testdata/join/parquet/fact/data_B2.parquet create mode 100644 testdata/join/parquet/fact/data_C0.parquet create mode 100644 testdata/join/parquet/fact/data_C1.parquet create mode 100644 testdata/join/parquet/fact/data_D0.parquet diff --git a/testdata/join/csv/dim/data_A0.csv b/testdata/join/csv/dim/data_A0.csv new file mode 100644 index 00000000..9e45781b --- /dev/null +++ b/testdata/join/csv/dim/data_A0.csv @@ -0,0 +1,4 @@ +key,col1,col2,col3 +0,a,b,c +1,d,e,f + diff --git a/testdata/join/csv/dim/data_B0.csv b/testdata/join/csv/dim/data_B0.csv new file mode 100644 index 00000000..085a329c --- /dev/null +++ b/testdata/join/csv/dim/data_B0.csv @@ -0,0 +1,5 @@ +key,col1,col2,col3 +2,g,h,i +3,j,k,l +5,m,n,o + diff --git a/testdata/join/csv/dim/data_C0.csv b/testdata/join/csv/dim/data_C0.csv new file mode 100644 index 00000000..aa44a343 --- /dev/null +++ b/testdata/join/csv/dim/data_C0.csv @@ -0,0 +1,4 @@ +key,col1,col2,col3 +6,p,q,r +8,s,t,u + diff --git a/testdata/join/csv/dim/data_D0.csv b/testdata/join/csv/dim/data_D0.csv new file mode 100644 index 00000000..df52870f --- /dev/null +++
b/testdata/join/csv/dim/data_D0.csv @@ -0,0 +1,6 @@ +key,col1,col2,col3 +10,v,w,x +11,y,z,a +15,b,c,d +18,e,f,g + diff --git a/testdata/join/csv/fact/data_A0.csv b/testdata/join/csv/fact/data_A0.csv new file mode 100644 index 00000000..f1355f30 --- /dev/null +++ b/testdata/join/csv/fact/data_A0.csv @@ -0,0 +1,4 @@ +key,col1,col2,col3 +0,z,y,x +1,w,v,u + diff --git a/testdata/join/csv/fact/data_B0.csv b/testdata/join/csv/fact/data_B0.csv new file mode 100644 index 00000000..12272e6a --- /dev/null +++ b/testdata/join/csv/fact/data_B0.csv @@ -0,0 +1,5 @@ +key,col1,col2,col3 +2,t,s,r +3,q,p,o +3,n,m,l + diff --git a/testdata/join/csv/fact/data_B1.csv b/testdata/join/csv/fact/data_B1.csv new file mode 100644 index 00000000..3923cef3 --- /dev/null +++ b/testdata/join/csv/fact/data_B1.csv @@ -0,0 +1,6 @@ +key,col1,col2,col3 +2,h,g,f +3,e,d,c +3,b,a,z +4,y,x,w + diff --git a/testdata/join/csv/fact/data_B2.csv b/testdata/join/csv/fact/data_B2.csv new file mode 100644 index 00000000..5b4f2a8e --- /dev/null +++ b/testdata/join/csv/fact/data_B2.csv @@ -0,0 +1,3 @@ +key,col1,col2,col3 +5,v,u,t + diff --git a/testdata/join/csv/fact/data_C0.csv b/testdata/join/csv/fact/data_C0.csv new file mode 100644 index 00000000..0d131567 --- /dev/null +++ b/testdata/join/csv/fact/data_C0.csv @@ -0,0 +1,4 @@ +key,col1,col2,col3 +6,v,u,t +7,s,r,q + diff --git a/testdata/join/csv/fact/data_C1.csv b/testdata/join/csv/fact/data_C1.csv new file mode 100644 index 00000000..b322fdb8 --- /dev/null +++ b/testdata/join/csv/fact/data_C1.csv @@ -0,0 +1,4 @@ +key,col1,col2,col3 +8,p,o,n +8,m,l,k + diff --git a/testdata/join/csv/fact/data_D0.csv b/testdata/join/csv/fact/data_D0.csv new file mode 100644 index 00000000..435b4ea9 --- /dev/null +++ b/testdata/join/csv/fact/data_D0.csv @@ -0,0 +1,8 @@ +key,col1,col2,col3 +9,j,i,h +10,g,f,e +11,d,c,b +11,a,z,y +12,x,w,v +15,u,t,s + diff --git a/testdata/join/generate_parquet_from_csv.sql b/testdata/join/generate_parquet_from_csv.sql new file mode 100644 index 00000000..9023b3b5 --- /dev/null +++ b/testdata/join/generate_parquet_from_csv.sql @@ -0,0 +1,48 @@ +-- datafusion-cli -f testdata/join/generate_parquet_from_csv.sql + +-- Generate parquet dim files from csv files. +COPY (SELECT * FROM "testdata/join/csv/dim/data_A0.csv") +TO "testdata/join/parquet/dim/data_A0.parquet" +STORED AS PARQUET; + +COPY (SELECT * FROM "testdata/join/csv/dim/data_B0.csv") +TO "testdata/join/parquet/dim/data_B0.parquet" +STORED AS PARQUET; + +COPY (SELECT * FROM "testdata/join/csv/dim/data_C0.csv") +TO "testdata/join/parquet/dim/data_C0.parquet" +STORED AS PARQUET; + +COPY (SELECT * FROM "testdata/join/csv/dim/data_D0.csv") +TO "testdata/join/parquet/dim/data_D0.parquet" +STORED AS PARQUET; + +-- Generate parquet fact files from csv files. 
+COPY (SELECT * FROM "testdata/join/csv/fact/data_A0.csv") +TO "testdata/join/parquet/fact/data_A0.parquet" +STORED AS PARQUET; + +COPY (SELECT * FROM "testdata/join/csv/fact/data_B0.csv") +TO "testdata/join/parquet/fact/data_B0.parquet" +STORED AS PARQUET; + +COPY (SELECT * FROM "testdata/join/csv/fact/data_B1.csv") +TO "testdata/join/parquet/fact/data_B1.parquet" +STORED AS PARQUET; + +COPY (SELECT * FROM "testdata/join/csv/fact/data_B2.csv") +TO "testdata/join/parquet/fact/data_B2.parquet" +STORED AS PARQUET; + +COPY (SELECT * FROM "testdata/join/csv/fact/data_C0.csv") +TO "testdata/join/parquet/fact/data_C0.parquet" +STORED AS PARQUET; + +COPY (SELECT * FROM "testdata/join/csv/fact/data_C1.csv") +TO "testdata/join/parquet/fact/data_C1.parquet" +STORED AS PARQUET; + +COPY (SELECT * FROM "testdata/join/csv/fact/data_D0.csv") +TO "testdata/join/parquet/fact/data_D0.parquet" +STORED AS PARQUET; + diff --git a/testdata/join/parquet/dim/data_A0.parquet b/testdata/join/parquet/dim/data_A0.parquet new file mode 100644 index 00000000..2ba5dcdc --- /dev/null +++ b/testdata/join/parquet/dim/data_A0.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78b746b8a11a7a5103145038cc9d1ab3524469dd6a4db11084ca85bdd7c577ec +size 1304 diff --git a/testdata/join/parquet/dim/data_B0.parquet b/testdata/join/parquet/dim/data_B0.parquet new file mode 100644 index 00000000..f239920d --- /dev/null +++ b/testdata/join/parquet/dim/data_B0.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dcffe93f049bb3517c4db75c228647d167bceca5cdcc2188f8cf5d1ab65f892 +size 1328 diff --git a/testdata/join/parquet/dim/data_C0.parquet b/testdata/join/parquet/dim/data_C0.parquet new file mode 100644 index 00000000..11ac436c --- /dev/null +++ b/testdata/join/parquet/dim/data_C0.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd7adc8388ba8d6aef8487bc8c234712b4d91dcb02f7a20b174a3beb05d24751 +size 1304 diff --git a/testdata/join/parquet/dim/data_D0.parquet b/testdata/join/parquet/dim/data_D0.parquet new file mode 100644 index 00000000..3f00302f --- /dev/null +++ b/testdata/join/parquet/dim/data_D0.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2926b2db48fadba952f5d91d669adf05e034c2b12d20a94106ef4833bc2afe +size 1344 diff --git a/testdata/join/parquet/fact/data_A0.parquet b/testdata/join/parquet/fact/data_A0.parquet new file mode 100644 index 00000000..8078329e --- /dev/null +++ b/testdata/join/parquet/fact/data_A0.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd2f4be607947d2870b1b8bf83e9aefe8646864efab52bc3768b7b0d2c4e2e1a +size 1304 diff --git a/testdata/join/parquet/fact/data_B0.parquet b/testdata/join/parquet/fact/data_B0.parquet new file mode 100644 index 00000000..8a1f4cb4 --- /dev/null +++ b/testdata/join/parquet/fact/data_B0.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ac1a92ca2adab640d02b6c95374f3b5fa5ef51a6a5391ef30fb36d9767a8f7b +size 1322 diff --git a/testdata/join/parquet/fact/data_B1.parquet b/testdata/join/parquet/fact/data_B1.parquet new file mode 100644 index 00000000..3e445f28 --- /dev/null +++ b/testdata/join/parquet/fact/data_B1.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff883e2d28c950dbf5fbe6cffcc2f4bf4df0b4810e13d2eee9f6f3b4faff3a22 +size 1343 diff --git a/testdata/join/parquet/fact/data_B2.parquet b/testdata/join/parquet/fact/data_B2.parquet new file mode 100644 index 00000000..0bc3e1b7 --- 
/dev/null +++ b/testdata/join/parquet/fact/data_B2.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:067a64494be0b3f670fe1c8c44af71413daab4992aaa237fb9c089d88f47c15b +size 1274 diff --git a/testdata/join/parquet/fact/data_C0.parquet b/testdata/join/parquet/fact/data_C0.parquet new file mode 100644 index 00000000..02c7a110 --- /dev/null +++ b/testdata/join/parquet/fact/data_C0.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67ca4367d7c20feb7b7e6ec14bf28f07566bb5bc6ab540722c171ec4e7900da2 +size 1304 diff --git a/testdata/join/parquet/fact/data_C1.parquet b/testdata/join/parquet/fact/data_C1.parquet new file mode 100644 index 00000000..6812fa76 --- /dev/null +++ b/testdata/join/parquet/fact/data_C1.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23554d4ad9cb220fbf225dad268153b4d0b4649d61f81f0f5eca174366517909 +size 1295 diff --git a/testdata/join/parquet/fact/data_D0.parquet b/testdata/join/parquet/fact/data_D0.parquet new file mode 100644 index 00000000..56966642 --- /dev/null +++ b/testdata/join/parquet/fact/data_D0.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bac11a954def357d8758ce5211e51a6e6321c29eeabbf3d91ecf5a2fc129493e +size 1383 From 16fb14d7f8ca06b49400357ec66cdec145e0efea Mon Sep 17 00:00:00 2001 From: Justin O'Dwyer Date: Wed, 17 Dec 2025 11:27:53 -0500 Subject: [PATCH 02/14] Added basic single-node join test for comparison. --- tests/join.rs | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 tests/join.rs diff --git a/tests/join.rs b/tests/join.rs new file mode 100644 index 00000000..9c13029a --- /dev/null +++ b/tests/join.rs @@ -0,0 +1,38 @@ +#[cfg(all(feature = "integration", test))] +mod tests { + use arrow::util::pretty; + use datafusion::{ + physical_plan::{collect, displayable}, + prelude::{ParquetReadOptions, SessionContext}, + }; + + #[tokio::test] + async fn test_join() -> Result<(), Box<dyn std::error::Error>> { + let ctx = SessionContext::new(); + ctx.register_parquet( + "dim", + "testdata/join/parquet/dim", + ParquetReadOptions::new(), + ) + .await?; + ctx.register_parquet( + "fact", + "testdata/join/parquet/fact", + ParquetReadOptions::new(), + ) + .await?; + + let sql = "SELECT * FROM dim JOIN fact ON dim.key = fact.key"; + let df = ctx.sql(sql).await?; + + let (state, logical_plan) = df.into_parts(); + let physical_plan = state.create_physical_plan(&logical_plan).await?; + println!("\n——————— PHYSICAL PLAN ———————\n"); + println!("{}", displayable(physical_plan.as_ref()).indent(true)); + + let result = collect(physical_plan, state.task_ctx()).await?; + pretty::print_batches(&result)?; + Ok(()) + } +} + From c181e3456404e13de967db3376cdf80a9c75dae0 Mon Sep 17 00:00:00 2001 From: Justin O'Dwyer Date: Thu, 18 Dec 2025 14:29:31 -0500 Subject: [PATCH 03/14] Use hive partitioning.
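The dim and fact data are now laid out as hive-style partitions (e.g. dim/d_dkey=A/data0.csv), so the partition value comes from the directory name instead of a `key` column stored in every row.

Below is a minimal sketch of registering such a directory, assuming the partition column is declared explicitly via ParquetReadOptions::table_partition_cols; the test in this series simply calls register_parquet on the table root, so the explicit declaration (and the hard-coded column name and type) is illustration only:

    use datafusion::arrow::datatypes::DataType;
    use datafusion::prelude::{ParquetReadOptions, SessionContext};

    #[tokio::main]
    async fn main() -> datafusion::error::Result<()> {
        let ctx = SessionContext::new();
        // `d_dkey` is not stored in any parquet file; it is derived from
        // the d_dkey=A / d_dkey=B / ... directory names under the root.
        ctx.register_parquet(
            "dim",
            "testdata/join/parquet/dim",
            ParquetReadOptions::new()
                .table_partition_cols(vec![("d_dkey".to_string(), DataType::Utf8)]),
        )
        .await?;
        // The partition column is queryable like any file column, and a
        // predicate on it prunes whole partition directories.
        ctx.sql("SELECT * FROM dim WHERE d_dkey = 'B'").await?.show().await?;
        Ok(())
    }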
--- testdata/join/csv/dim/d_dkey=A/data0.csv | 3 + testdata/join/csv/dim/d_dkey=B/data0.csv | 4 ++ testdata/join/csv/dim/d_dkey=C/data0.csv | 4 ++ testdata/join/csv/dim/d_dkey=D/data0.csv | 4 ++ testdata/join/csv/dim/data_A0.csv | 4 -- testdata/join/csv/dim/data_B0.csv | 5 -- testdata/join/csv/dim/data_C0.csv | 4 -- testdata/join/csv/dim/data_D0.csv | 6 -- testdata/join/csv/fact/d_dkey=A/data0.csv | 9 +++ testdata/join/csv/fact/d_dkey=B/data0.csv | 9 +++ testdata/join/csv/fact/d_dkey=B/data1.csv | 8 +++ testdata/join/csv/fact/d_dkey=B/data2.csv | 7 ++ testdata/join/csv/fact/d_dkey=C/data0.csv | 9 +++ testdata/join/csv/fact/d_dkey=C/data1.csv | 9 +++ testdata/join/csv/fact/d_dkey=D/data0.csv | 5 ++ testdata/join/csv/fact/data_A0.csv | 4 -- testdata/join/csv/fact/data_B0.csv | 5 -- testdata/join/csv/fact/data_B1.csv | 6 -- testdata/join/csv/fact/data_B2.csv | 3 - testdata/join/csv/fact/data_C0.csv | 4 -- testdata/join/csv/fact/data_C1.csv | 4 -- testdata/join/csv/fact/data_D0.csv | 8 --- testdata/join/generate_parquet_from_csv.sql | 44 ++++++------- .../join/parquet/dim/d_dkey=A/data0.parquet | 3 + .../join/parquet/dim/d_dkey=B/data0.parquet | 3 + .../join/parquet/dim/d_dkey=C/data0.parquet | 3 + .../join/parquet/dim/d_dkey=D/data0.parquet | 3 + testdata/join/parquet/dim/data_A0.parquet | 3 - testdata/join/parquet/dim/data_B0.parquet | 3 - testdata/join/parquet/dim/data_C0.parquet | 3 - testdata/join/parquet/dim/data_D0.parquet | 3 - .../join/parquet/fact/d_dkey=A/data0.parquet | 3 + .../join/parquet/fact/d_dkey=B/data0.parquet | 3 + .../join/parquet/fact/d_dkey=B/data1.parquet | 3 + .../join/parquet/fact/d_dkey=B/data2.parquet | 3 + .../join/parquet/fact/d_dkey=C/data0.parquet | 3 + .../join/parquet/fact/d_dkey=C/data1.parquet | 3 + .../join/parquet/fact/d_dkey=D/data0.parquet | 3 + testdata/join/parquet/fact/data_A0.parquet | 3 - testdata/join/parquet/fact/data_B0.parquet | 3 - testdata/join/parquet/fact/data_B1.parquet | 3 - testdata/join/parquet/fact/data_B2.parquet | 3 - testdata/join/parquet/fact/data_C0.parquet | 3 - testdata/join/parquet/fact/data_C1.parquet | 3 - testdata/join/parquet/fact/data_D0.parquet | 3 - tests/join.rs | 66 +++++++++++++++++-- 46 files changed, 186 insertions(+), 114 deletions(-) create mode 100644 testdata/join/csv/dim/d_dkey=A/data0.csv create mode 100644 testdata/join/csv/dim/d_dkey=B/data0.csv create mode 100644 testdata/join/csv/dim/d_dkey=C/data0.csv create mode 100644 testdata/join/csv/dim/d_dkey=D/data0.csv delete mode 100644 testdata/join/csv/dim/data_A0.csv delete mode 100644 testdata/join/csv/dim/data_B0.csv delete mode 100644 testdata/join/csv/dim/data_C0.csv delete mode 100644 testdata/join/csv/dim/data_D0.csv create mode 100644 testdata/join/csv/fact/d_dkey=A/data0.csv create mode 100644 testdata/join/csv/fact/d_dkey=B/data0.csv create mode 100644 testdata/join/csv/fact/d_dkey=B/data1.csv create mode 100644 testdata/join/csv/fact/d_dkey=B/data2.csv create mode 100644 testdata/join/csv/fact/d_dkey=C/data0.csv create mode 100644 testdata/join/csv/fact/d_dkey=C/data1.csv create mode 100644 testdata/join/csv/fact/d_dkey=D/data0.csv delete mode 100644 testdata/join/csv/fact/data_A0.csv delete mode 100644 testdata/join/csv/fact/data_B0.csv delete mode 100644 testdata/join/csv/fact/data_B1.csv delete mode 100644 testdata/join/csv/fact/data_B2.csv delete mode 100644 testdata/join/csv/fact/data_C0.csv delete mode 100644 testdata/join/csv/fact/data_C1.csv delete mode 100644 testdata/join/csv/fact/data_D0.csv create mode 100644 
testdata/join/parquet/dim/d_dkey=A/data0.parquet create mode 100644 testdata/join/parquet/dim/d_dkey=B/data0.parquet create mode 100644 testdata/join/parquet/dim/d_dkey=C/data0.parquet create mode 100644 testdata/join/parquet/dim/d_dkey=D/data0.parquet delete mode 100644 testdata/join/parquet/dim/data_A0.parquet delete mode 100644 testdata/join/parquet/dim/data_B0.parquet delete mode 100644 testdata/join/parquet/dim/data_C0.parquet delete mode 100644 testdata/join/parquet/dim/data_D0.parquet create mode 100644 testdata/join/parquet/fact/d_dkey=A/data0.parquet create mode 100644 testdata/join/parquet/fact/d_dkey=B/data0.parquet create mode 100644 testdata/join/parquet/fact/d_dkey=B/data1.parquet create mode 100644 testdata/join/parquet/fact/d_dkey=B/data2.parquet create mode 100644 testdata/join/parquet/fact/d_dkey=C/data0.parquet create mode 100644 testdata/join/parquet/fact/d_dkey=C/data1.parquet create mode 100644 testdata/join/parquet/fact/d_dkey=D/data0.parquet delete mode 100644 testdata/join/parquet/fact/data_A0.parquet delete mode 100644 testdata/join/parquet/fact/data_B0.parquet delete mode 100644 testdata/join/parquet/fact/data_B1.parquet delete mode 100644 testdata/join/parquet/fact/data_B2.parquet delete mode 100644 testdata/join/parquet/fact/data_C0.parquet delete mode 100644 testdata/join/parquet/fact/data_C1.parquet delete mode 100644 testdata/join/parquet/fact/data_D0.parquet diff --git a/testdata/join/csv/dim/d_dkey=A/data0.csv b/testdata/join/csv/dim/d_dkey=A/data0.csv new file mode 100644 index 00000000..60818a27 --- /dev/null +++ b/testdata/join/csv/dim/d_dkey=A/data0.csv @@ -0,0 +1,3 @@ +env,service,host +dev,log,host-y + diff --git a/testdata/join/csv/dim/d_dkey=B/data0.csv b/testdata/join/csv/dim/d_dkey=B/data0.csv new file mode 100644 index 00000000..98999c08 --- /dev/null +++ b/testdata/join/csv/dim/d_dkey=B/data0.csv @@ -0,0 +1,4 @@ +env,service,host +prod,log,host-x +prod,api,host-x + diff --git a/testdata/join/csv/dim/d_dkey=C/data0.csv b/testdata/join/csv/dim/d_dkey=C/data0.csv new file mode 100644 index 00000000..f34ed438 --- /dev/null +++ b/testdata/join/csv/dim/d_dkey=C/data0.csv @@ -0,0 +1,4 @@ +env,service,host +dev,trace,host-z +prod,log,host-y + diff --git a/testdata/join/csv/dim/d_dkey=D/data0.csv b/testdata/join/csv/dim/d_dkey=D/data0.csv new file mode 100644 index 00000000..5e9e02bd --- /dev/null +++ b/testdata/join/csv/dim/d_dkey=D/data0.csv @@ -0,0 +1,4 @@ +env,service,host +dev,log,host-x +prod,trace,host-z + diff --git a/testdata/join/csv/dim/data_A0.csv b/testdata/join/csv/dim/data_A0.csv deleted file mode 100644 index 9e45781b..00000000 --- a/testdata/join/csv/dim/data_A0.csv +++ /dev/null @@ -1,4 +0,0 @@ -key,col1,col2,col3 -0,a,b,c -1,d,e,f - diff --git a/testdata/join/csv/dim/data_B0.csv b/testdata/join/csv/dim/data_B0.csv deleted file mode 100644 index 085a329c..00000000 --- a/testdata/join/csv/dim/data_B0.csv +++ /dev/null @@ -1,5 +0,0 @@ -key,col1,col2,col3 -2,g,h,i -3,j,k,l -5,m,n,o - diff --git a/testdata/join/csv/dim/data_C0.csv b/testdata/join/csv/dim/data_C0.csv deleted file mode 100644 index aa44a343..00000000 --- a/testdata/join/csv/dim/data_C0.csv +++ /dev/null @@ -1,4 +0,0 @@ -key,col1,col2,col3 -6,p,q,r -8,s,t,u - diff --git a/testdata/join/csv/dim/data_D0.csv b/testdata/join/csv/dim/data_D0.csv deleted file mode 100644 index df52870f..00000000 --- a/testdata/join/csv/dim/data_D0.csv +++ /dev/null @@ -1,6 +0,0 @@ -key,col1,col2,col3 -10,v,w,x -11,y,z,a -15,b,c,d -18,e,f,g - diff --git a/testdata/join/csv/fact/d_dkey=A/data0.csv 
b/testdata/join/csv/fact/d_dkey=A/data0.csv new file mode 100644 index 00000000..ed765e7d --- /dev/null +++ b/testdata/join/csv/fact/d_dkey=A/data0.csv @@ -0,0 +1,9 @@ +timestamp,value +2023-01-01T09:00:00,95.5 +2023-01-01T09:00:10,102.3 +2023-01-01T09:00:20,98.7 +2023-01-01T09:12:20,105.1 +2023-01-01T09:12:30,100.0 +2023-01-01T09:12:40,150.0 +2023-01-01T09:12:50,120.8 + diff --git a/testdata/join/csv/fact/d_dkey=B/data0.csv b/testdata/join/csv/fact/d_dkey=B/data0.csv new file mode 100644 index 00000000..b2d4c205 --- /dev/null +++ b/testdata/join/csv/fact/d_dkey=B/data0.csv @@ -0,0 +1,9 @@ +timestamp,value +2023-01-01T09:00:00,75.2 +2023-01-01T09:00:10,82.4 +2023-01-01T09:00:20,78.9 +2023-01-01T09:00:30,85.6 +2023-01-01T09:12:30,80.0 +2023-01-01T09:12:40,120.0 +2023-01-01T09:12:50,92.3 + diff --git a/testdata/join/csv/fact/d_dkey=B/data1.csv b/testdata/join/csv/fact/d_dkey=B/data1.csv new file mode 100644 index 00000000..2af219e4 --- /dev/null +++ b/testdata/join/csv/fact/d_dkey=B/data1.csv @@ -0,0 +1,8 @@ +timestamp,value +2023-01-01T10:00:00,88.5 +2023-01-01T10:00:10,91.2 +2023-01-01T10:00:20,87.3 +2023-01-01T10:00:30,94.1 +2023-01-01T10:12:30,89.5 +2023-01-01T10:12:40,95.8 + diff --git a/testdata/join/csv/fact/d_dkey=B/data2.csv b/testdata/join/csv/fact/d_dkey=B/data2.csv new file mode 100644 index 00000000..3f807744 --- /dev/null +++ b/testdata/join/csv/fact/d_dkey=B/data2.csv @@ -0,0 +1,7 @@ +timestamp,value +2023-01-01T11:00:00,72.8 +2023-01-01T11:00:10,79.4 +2023-01-01T11:00:20,76.1 +2023-01-01T11:00:30,83.7 +2023-01-01T11:12:30,77.2 + diff --git a/testdata/join/csv/fact/d_dkey=C/data0.csv b/testdata/join/csv/fact/d_dkey=C/data0.csv new file mode 100644 index 00000000..0f21f8fa --- /dev/null +++ b/testdata/join/csv/fact/d_dkey=C/data0.csv @@ -0,0 +1,9 @@ +timestamp,value +2023-01-01T10:00:00,310.5 +2023-01-01T10:00:10,225.7 +2023-01-01T10:00:20,380.2 +2023-01-01T10:00:30,205.8 +2023-01-01T10:00:40,350.0 +2023-01-01T10:12:40,200.0 +2023-01-01T10:12:50,205.4 + diff --git a/testdata/join/csv/fact/d_dkey=C/data1.csv b/testdata/join/csv/fact/d_dkey=C/data1.csv new file mode 100644 index 00000000..1c1ff5a7 --- /dev/null +++ b/testdata/join/csv/fact/d_dkey=C/data1.csv @@ -0,0 +1,9 @@ +timestamp,value +2023-01-01T11:00:00,295.3 +2023-01-01T11:00:10,318.6 +2023-01-01T11:00:20,342.9 +2023-01-01T11:00:30,287.4 +2023-01-01T11:00:40,365.2 +2023-01-01T11:12:40,310.8 +2023-01-01T11:12:50,298.1 + diff --git a/testdata/join/csv/fact/d_dkey=D/data0.csv b/testdata/join/csv/fact/d_dkey=D/data0.csv new file mode 100644 index 00000000..c15bfc1b --- /dev/null +++ b/testdata/join/csv/fact/d_dkey=D/data0.csv @@ -0,0 +1,5 @@ +timestamp,value +2023-01-01T10:00:00,24.8 +2023-01-01T10:00:10,72.1 +2023-01-01T10:00:20,42.5 + diff --git a/testdata/join/csv/fact/data_A0.csv b/testdata/join/csv/fact/data_A0.csv deleted file mode 100644 index f1355f30..00000000 --- a/testdata/join/csv/fact/data_A0.csv +++ /dev/null @@ -1,4 +0,0 @@ -key,col1,col2,col3 -0,z,y,x -1,w,v,u - diff --git a/testdata/join/csv/fact/data_B0.csv b/testdata/join/csv/fact/data_B0.csv deleted file mode 100644 index 12272e6a..00000000 --- a/testdata/join/csv/fact/data_B0.csv +++ /dev/null @@ -1,5 +0,0 @@ -key,col1,col2,col3 -2,t,s,r -3,q,p,o -3,n,m,l - diff --git a/testdata/join/csv/fact/data_B1.csv b/testdata/join/csv/fact/data_B1.csv deleted file mode 100644 index 3923cef3..00000000 --- a/testdata/join/csv/fact/data_B1.csv +++ /dev/null @@ -1,6 +0,0 @@ -key,col1,col2,col3 -2,h,g,f -3,e,d,c -3,b,a,z -4,y,x,w - diff --git 
a/testdata/join/csv/fact/data_B2.csv b/testdata/join/csv/fact/data_B2.csv deleted file mode 100644 index 5b4f2a8e..00000000 --- a/testdata/join/csv/fact/data_B2.csv +++ /dev/null @@ -1,3 +0,0 @@ -key,col1,col2,col3 -5,v,u,t - diff --git a/testdata/join/csv/fact/data_C0.csv b/testdata/join/csv/fact/data_C0.csv deleted file mode 100644 index 0d131567..00000000 --- a/testdata/join/csv/fact/data_C0.csv +++ /dev/null @@ -1,4 +0,0 @@ -key,col1,col2,col3 -6,v,u,t -7,s,r,q - diff --git a/testdata/join/csv/fact/data_C1.csv b/testdata/join/csv/fact/data_C1.csv deleted file mode 100644 index b322fdb8..00000000 --- a/testdata/join/csv/fact/data_C1.csv +++ /dev/null @@ -1,4 +0,0 @@ -key,col1,col2,col3 -8,p,o,n -8,m,l,k - diff --git a/testdata/join/csv/fact/data_D0.csv b/testdata/join/csv/fact/data_D0.csv deleted file mode 100644 index 435b4ea9..00000000 --- a/testdata/join/csv/fact/data_D0.csv +++ /dev/null @@ -1,8 +0,0 @@ -key,col1,col2,col3 -9,j,i,h -10,g,f,e -11,d,c,b -11,a,z,y -12,x,w,v -15,u,t,s - diff --git a/testdata/join/generate_parquet_from_csv.sql b/testdata/join/generate_parquet_from_csv.sql index 9023b3b5..d3b27813 100644 --- a/testdata/join/generate_parquet_from_csv.sql +++ b/testdata/join/generate_parquet_from_csv.sql @@ -1,48 +1,48 @@ -- datafusion-cli -f testdata/join/generate_parquet_from_csv.sql -- Generate parquet dim files from csv files. -COPY (SELECT * FROM "testdata/join/csv/dim/data_A0.csv") -TO "testdata/join/parquet/dim/data_A0.parquet" +COPY (SELECT * FROM "testdata/join/csv/dim/d_dkey=A/data0.csv") +TO "testdata/join/parquet/dim/d_dkey=A/data0.parquet" STORED AS PARQUET; -COPY (SELECT * FROM "testdata/join/csv/dim/data_B0.csv") -TO "testdata/join/parquet/dim/data_B0.parquet" +COPY (SELECT * FROM "testdata/join/csv/dim/d_dkey=B/data0.csv") +TO "testdata/join/parquet/dim/d_dkey=B/data0.parquet" STORED AS PARQUET; -COPY (SELECT * FROM "testdata/join/csv/dim/data_C0.csv") -TO "testdata/join/parquet/dim/data_C0.parquet" +COPY (SELECT * FROM "testdata/join/csv/dim/d_dkey=C/data0.csv") +TO "testdata/join/parquet/dim/d_dkey=C/data0.parquet" STORED AS PARQUET; -COPY (SELECT * FROM "testdata/join/csv/dim/data_D0.csv") -TO "testdata/join/parquet/dim/data_D0.parquet" +COPY (SELECT * FROM "testdata/join/csv/dim/d_dkey=D/data0.csv") +TO "testdata/join/parquet/dim/d_dkey=D/data0.parquet" STORED AS PARQUET; -- Generate parquet fact files from csv files. 
-COPY (SELECT * FROM "testdata/join/csv/fact/data_A0.csv") -TO "testdata/join/parquet/fact/data_A0.parquet" +COPY (SELECT * FROM "testdata/join/csv/fact/d_dkey=A/data0.csv") +TO "testdata/join/parquet/fact/d_dkey=A/data0.parquet" STORED AS PARQUET; -COPY (SELECT * FROM "testdata/join/csv/fact/data_B0.csv") -TO "testdata/join/parquet/fact/data_B0.parquet" +COPY (SELECT * FROM "testdata/join/csv/fact/d_dkey=B/data0.csv") +TO "testdata/join/parquet/fact/d_dkey=B/data0.parquet" STORED AS PARQUET; -COPY (SELECT * FROM "testdata/join/csv/fact/data_B1.csv") -TO "testdata/join/parquet/fact/data_B1.parquet" +COPY (SELECT * FROM "testdata/join/csv/fact/d_dkey=B/data1.csv") +TO "testdata/join/parquet/fact/d_dkey=B/data1.parquet" STORED AS PARQUET; -COPY (SELECT * FROM "testdata/join/csv/fact/data_B2.csv") -TO "testdata/join/parquet/fact/data_B2.parquet" +COPY (SELECT * FROM "testdata/join/csv/fact/d_dkey=B/data2.csv") +TO "testdata/join/parquet/fact/d_dkey=B/data2.parquet" STORED AS PARQUET; -COPY (SELECT * FROM "testdata/join/csv/fact/data_C0.csv") -TO "testdata/join/parquet/fact/data_C0.parquet" +COPY (SELECT * FROM "testdata/join/csv/fact/d_dkey=C/data0.csv") +TO "testdata/join/parquet/fact/d_dkey=C/data0.parquet" STORED AS PARQUET; -COPY (SELECT * FROM "testdata/join/csv/fact/data_C1.csv") -TO "testdata/join/parquet/fact/data_C1.parquet" +COPY (SELECT * FROM "testdata/join/csv/fact/d_dkey=C/data1.csv") +TO "testdata/join/parquet/fact/d_dkey=C/data1.parquet" STORED AS PARQUET; -COPY (SELECT * FROM "testdata/join/csv/fact/data_D0.csv") -TO "testdata/join/parquet/fact/data_D0.parquet" +COPY (SELECT * FROM "testdata/join/csv/fact/d_dkey=D/data0.csv") +TO "testdata/join/parquet/fact/d_dkey=D/data0.parquet" STORED AS PARQUET; diff --git a/testdata/join/parquet/dim/d_dkey=A/data0.parquet b/testdata/join/parquet/dim/d_dkey=A/data0.parquet new file mode 100644 index 00000000..6865b9d2 --- /dev/null +++ b/testdata/join/parquet/dim/d_dkey=A/data0.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8752b1efcfb3f541d0481e397fb0920060d1b324972228bf38f2c6547838374 +size 1011 diff --git a/testdata/join/parquet/dim/d_dkey=B/data0.parquet b/testdata/join/parquet/dim/d_dkey=B/data0.parquet new file mode 100644 index 00000000..98389be3 --- /dev/null +++ b/testdata/join/parquet/dim/d_dkey=B/data0.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c669a99a72f130361c718865d68930b36321e70f8ef0c4e09711ab555a90e610 +size 1024 diff --git a/testdata/join/parquet/dim/d_dkey=C/data0.parquet b/testdata/join/parquet/dim/d_dkey=C/data0.parquet new file mode 100644 index 00000000..fe6b7060 --- /dev/null +++ b/testdata/join/parquet/dim/d_dkey=C/data0.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46574b458899c78edab412f8d099df6992bc04fe3cd077c9d95be996294a2cb3 +size 1044 diff --git a/testdata/join/parquet/dim/d_dkey=D/data0.parquet b/testdata/join/parquet/dim/d_dkey=D/data0.parquet new file mode 100644 index 00000000..b2df4542 --- /dev/null +++ b/testdata/join/parquet/dim/d_dkey=D/data0.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:725fa8b471231b9afb1476b84844eea020d84c949fe44dab426a52af7566f8b4 +size 1044 diff --git a/testdata/join/parquet/dim/data_A0.parquet b/testdata/join/parquet/dim/data_A0.parquet deleted file mode 100644 index 2ba5dcdc..00000000 --- a/testdata/join/parquet/dim/data_A0.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:78b746b8a11a7a5103145038cc9d1ab3524469dd6a4db11084ca85bdd7c577ec -size 1304 diff --git a/testdata/join/parquet/dim/data_B0.parquet b/testdata/join/parquet/dim/data_B0.parquet deleted file mode 100644 index f239920d..00000000 --- a/testdata/join/parquet/dim/data_B0.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7dcffe93f049bb3517c4db75c228647d167bceca5cdcc2188f8cf5d1ab65f892 -size 1328 diff --git a/testdata/join/parquet/dim/data_C0.parquet b/testdata/join/parquet/dim/data_C0.parquet deleted file mode 100644 index 11ac436c..00000000 --- a/testdata/join/parquet/dim/data_C0.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fd7adc8388ba8d6aef8487bc8c234712b4d91dcb02f7a20b174a3beb05d24751 -size 1304 diff --git a/testdata/join/parquet/dim/data_D0.parquet b/testdata/join/parquet/dim/data_D0.parquet deleted file mode 100644 index 3f00302f..00000000 --- a/testdata/join/parquet/dim/data_D0.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4d2926b2db48fadba952f5d91d669adf05e034c2b12d20a94106ef4833bc2afe -size 1344 diff --git a/testdata/join/parquet/fact/d_dkey=A/data0.parquet b/testdata/join/parquet/fact/d_dkey=A/data0.parquet new file mode 100644 index 00000000..9bca1622 --- /dev/null +++ b/testdata/join/parquet/fact/d_dkey=A/data0.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0afda79871479819092957dbb75d6ed17d6953c059d1fea4cfb9df2aa74a5b3e +size 929 diff --git a/testdata/join/parquet/fact/d_dkey=B/data0.parquet b/testdata/join/parquet/fact/d_dkey=B/data0.parquet new file mode 100644 index 00000000..0aed0d21 --- /dev/null +++ b/testdata/join/parquet/fact/d_dkey=B/data0.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c1df83afcb61abd11ec6f3e3a75947b7d79cdee64104474234aea1c26c2e553 +size 925 diff --git a/testdata/join/parquet/fact/d_dkey=B/data1.parquet b/testdata/join/parquet/fact/d_dkey=B/data1.parquet new file mode 100644 index 00000000..fba94ef8 --- /dev/null +++ b/testdata/join/parquet/fact/d_dkey=B/data1.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:440f0bbac5647184e5420be6366ef443824885dd74989cbde08b2de245a5e359 +size 920 diff --git a/testdata/join/parquet/fact/d_dkey=B/data2.parquet b/testdata/join/parquet/fact/d_dkey=B/data2.parquet new file mode 100644 index 00000000..071ecb89 --- /dev/null +++ b/testdata/join/parquet/fact/d_dkey=B/data2.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a71b0c6efe0bf341558e7895db364ea045451729e7ea41957dd25851d2193aa +size 913 diff --git a/testdata/join/parquet/fact/d_dkey=C/data0.parquet b/testdata/join/parquet/fact/d_dkey=C/data0.parquet new file mode 100644 index 00000000..eccbdf4b --- /dev/null +++ b/testdata/join/parquet/fact/d_dkey=C/data0.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58669c93aba8f140ff95039a53d4bdb58f87f8cf7d2a0f95a3454ae017ccb1a7 +size 934 diff --git a/testdata/join/parquet/fact/d_dkey=C/data1.parquet b/testdata/join/parquet/fact/d_dkey=C/data1.parquet new file mode 100644 index 00000000..9825ab19 --- /dev/null +++ b/testdata/join/parquet/fact/d_dkey=C/data1.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:275f89a315e5131dea43789a21ff5fed2ffec4b8337558a76189c8e390826785 +size 936 diff --git a/testdata/join/parquet/fact/d_dkey=D/data0.parquet b/testdata/join/parquet/fact/d_dkey=D/data0.parquet new file 
mode 100644 index 00000000..c2e2f417 --- /dev/null +++ b/testdata/join/parquet/fact/d_dkey=D/data0.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:264bf30baa3997fa9cd104e58f42929110d78cc6f8cd7f4b4e9328406c43e429 +size 895 diff --git a/testdata/join/parquet/fact/data_A0.parquet b/testdata/join/parquet/fact/data_A0.parquet deleted file mode 100644 index 8078329e..00000000 --- a/testdata/join/parquet/fact/data_A0.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bd2f4be607947d2870b1b8bf83e9aefe8646864efab52bc3768b7b0d2c4e2e1a -size 1304 diff --git a/testdata/join/parquet/fact/data_B0.parquet b/testdata/join/parquet/fact/data_B0.parquet deleted file mode 100644 index 8a1f4cb4..00000000 --- a/testdata/join/parquet/fact/data_B0.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5ac1a92ca2adab640d02b6c95374f3b5fa5ef51a6a5391ef30fb36d9767a8f7b -size 1322 diff --git a/testdata/join/parquet/fact/data_B1.parquet b/testdata/join/parquet/fact/data_B1.parquet deleted file mode 100644 index 3e445f28..00000000 --- a/testdata/join/parquet/fact/data_B1.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ff883e2d28c950dbf5fbe6cffcc2f4bf4df0b4810e13d2eee9f6f3b4faff3a22 -size 1343 diff --git a/testdata/join/parquet/fact/data_B2.parquet b/testdata/join/parquet/fact/data_B2.parquet deleted file mode 100644 index 0bc3e1b7..00000000 --- a/testdata/join/parquet/fact/data_B2.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:067a64494be0b3f670fe1c8c44af71413daab4992aaa237fb9c089d88f47c15b -size 1274 diff --git a/testdata/join/parquet/fact/data_C0.parquet b/testdata/join/parquet/fact/data_C0.parquet deleted file mode 100644 index 02c7a110..00000000 --- a/testdata/join/parquet/fact/data_C0.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:67ca4367d7c20feb7b7e6ec14bf28f07566bb5bc6ab540722c171ec4e7900da2 -size 1304 diff --git a/testdata/join/parquet/fact/data_C1.parquet b/testdata/join/parquet/fact/data_C1.parquet deleted file mode 100644 index 6812fa76..00000000 --- a/testdata/join/parquet/fact/data_C1.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:23554d4ad9cb220fbf225dad268153b4d0b4649d61f81f0f5eca174366517909 -size 1295 diff --git a/testdata/join/parquet/fact/data_D0.parquet b/testdata/join/parquet/fact/data_D0.parquet deleted file mode 100644 index 56966642..00000000 --- a/testdata/join/parquet/fact/data_D0.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bac11a954def357d8758ce5211e51a6e6321c29eeabbf3d91ecf5a2fc129493e -size 1383 diff --git a/tests/join.rs b/tests/join.rs index 9c13029a..edd10741 100644 --- a/tests/join.rs +++ b/tests/join.rs @@ -2,12 +2,21 @@ mod tests { use arrow::util::pretty; use datafusion::{ + assert_batches_eq, physical_plan::{collect, displayable}, prelude::{ParquetReadOptions, SessionContext}, }; + use datafusion_distributed::{ + DefaultSessionBuilder, display_plan_ascii, + test_utils::localhost::start_localhost_context, + }; #[tokio::test] - async fn test_join() -> Result<(), Box<dyn std::error::Error>> { + async fn test_join_distributed() -> Result<(), Box<dyn std::error::Error>> { + + let query = "SELECT * FROM dim JOIN fact ON dim.key = fact.key ORDER BY dim.key, dim.col1, fact.col1"; + + // Execute the query using single-node datafusion.
let ctx = SessionContext::new(); ctx.register_parquet( "dim", @@ -22,16 +31,61 @@ mod tests { ) .await?; - let sql = "SELECT * FROM dim JOIN fact ON dim.key = fact.key"; - let df = ctx.sql(sql).await?; + let df = ctx.sql(query).await?; let (state, logical_plan) = df.into_parts(); let physical_plan = state.create_physical_plan(&logical_plan).await?; - println!("\n——————— PHYSICAL PLAN ———————\n"); + // println!("\n——————— PHYSICAL PLAN ———————\n"); println!("{}", displayable(physical_plan.as_ref()).indent(true)); - let result = collect(physical_plan, state.task_ctx()).await?; - pretty::print_batches(&result)?; + let non_distributed_result = collect(physical_plan, state.task_ctx()).await?; + pretty::print_batches(&non_distributed_result)?; + + // Execute the query using distributed datafusion. + let (distributed_ctx, _guard) = start_localhost_context(4, DefaultSessionBuilder).await; + distributed_ctx + .register_parquet( + "dim", + "testdata/join/parquet/dim", + ParquetReadOptions::new(), + ) + .await?; + distributed_ctx + .register_parquet( + "fact", + "testdata/join/parquet/fact", + ParquetReadOptions::new(), + ) + .await?; + + distributed_ctx + .state_ref() + .write() + .config_mut() + .options_mut() + .optimizer + .hash_join_single_partition_threshold = 0; + distributed_ctx + .state_ref() + .write() + .config_mut() + .options_mut() + .optimizer + .hash_join_single_partition_threshold_rows = 0; + + let df = distributed_ctx.sql(query).await?; + + let (state, logical_plan) = df.into_parts(); + let physical_plan = state.create_physical_plan(&logical_plan).await?; + println!("\n——————— DISTRIBUTED PLAN ———————\n"); + println!("{}", display_plan_ascii(physical_plan.as_ref(), false)); + + let distributed_result = collect(physical_plan, state.task_ctx()).await?; + pretty::print_batches(&distributed_result)?; + + // Compare single-node and distributed results. + assert_eq!(non_distributed_result, distributed_result); + Ok(()) } } From cab524b77127166b2223eedf33539222f55fe4d7 Mon Sep 17 00:00:00 2001 From: Justin O'Dwyer Date: Thu, 18 Dec 2025 16:04:08 -0500 Subject: [PATCH 04/14] Results are the same with hive partitioning and Gene's PR. 
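The fact table's partition key is renamed from d_dkey to f_dkey so the two tables no longer share a partition column name, and the datafusion crates are pinned to Gene's hash_partitioning_satisfies_subset branch, which, going by the branch name, lets an existing hash partitioning satisfy a requirement on a subset of its keys rather than forcing an extra repartition. The dim csv files are also trimmed by a few rows.

A sketch of the join shape this layout supports (hypothetical query text; the authoritative SQL lives in tests/join.rs):

    // Each side joins on its own hive partition column: dim carries
    // (env, service, host) under d_dkey, fact carries (timestamp, value)
    // under f_dkey.
    let query = "SELECT d.d_dkey, d.env, d.service, f.timestamp, f.value \
                 FROM dim d JOIN fact f ON d.d_dkey = f.f_dkey \
                 ORDER BY d.d_dkey, f.timestamp";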
--- Cargo.lock | 309 ++++++------------ Cargo.toml | 11 +- src/metrics/proto.rs | 2 +- src/protobuf/errors/datafusion_error.rs | 3 + testdata/join/csv/dim/d_dkey=B/data0.csv | 1 - testdata/join/csv/dim/d_dkey=C/data0.csv | 1 - testdata/join/csv/dim/d_dkey=D/data0.csv | 3 +- .../csv/fact/{d_dkey=A => f_dkey=A}/data0.csv | 0 .../csv/fact/{d_dkey=B => f_dkey=B}/data0.csv | 0 .../csv/fact/{d_dkey=B => f_dkey=B}/data1.csv | 0 .../csv/fact/{d_dkey=B => f_dkey=B}/data2.csv | 0 .../csv/fact/{d_dkey=C => f_dkey=C}/data0.csv | 0 .../csv/fact/{d_dkey=C => f_dkey=C}/data1.csv | 0 .../csv/fact/{d_dkey=D => f_dkey=D}/data0.csv | 0 testdata/join/generate_parquet_from_csv.sql | 28 +- .../join/parquet/dim/d_dkey=B/data0.parquet | 4 +- .../join/parquet/dim/d_dkey=C/data0.parquet | 4 +- .../join/parquet/dim/d_dkey=D/data0.parquet | 4 +- .../fact/{d_dkey=A => f_dkey=A}/data0.parquet | 0 .../fact/{d_dkey=B => f_dkey=B}/data0.parquet | 0 .../fact/{d_dkey=B => f_dkey=B}/data1.parquet | 0 .../fact/{d_dkey=B => f_dkey=B}/data2.parquet | 0 .../fact/{d_dkey=C => f_dkey=C}/data0.parquet | 0 .../fact/{d_dkey=C => f_dkey=C}/data1.parquet | 0 .../fact/{d_dkey=D => f_dkey=D}/data0.parquet | 0 tests/join.rs | 114 ++++--- 26 files changed, 197 insertions(+), 287 deletions(-) rename testdata/join/csv/fact/{d_dkey=A => f_dkey=A}/data0.csv (100%) rename testdata/join/csv/fact/{d_dkey=B => f_dkey=B}/data0.csv (100%) rename testdata/join/csv/fact/{d_dkey=B => f_dkey=B}/data1.csv (100%) rename testdata/join/csv/fact/{d_dkey=B => f_dkey=B}/data2.csv (100%) rename testdata/join/csv/fact/{d_dkey=C => f_dkey=C}/data0.csv (100%) rename testdata/join/csv/fact/{d_dkey=C => f_dkey=C}/data1.csv (100%) rename testdata/join/csv/fact/{d_dkey=D => f_dkey=D}/data0.csv (100%) rename testdata/join/parquet/fact/{d_dkey=A => f_dkey=A}/data0.parquet (100%) rename testdata/join/parquet/fact/{d_dkey=B => f_dkey=B}/data0.parquet (100%) rename testdata/join/parquet/fact/{d_dkey=B => f_dkey=B}/data1.parquet (100%) rename testdata/join/parquet/fact/{d_dkey=B => f_dkey=B}/data2.parquet (100%) rename testdata/join/parquet/fact/{d_dkey=C => f_dkey=C}/data0.parquet (100%) rename testdata/join/parquet/fact/{d_dkey=C => f_dkey=C}/data1.parquet (100%) rename testdata/join/parquet/fact/{d_dkey=D => f_dkey=D}/data0.parquet (100%) diff --git a/Cargo.lock b/Cargo.lock index fc93ec9b..d76789cd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -128,9 +128,9 @@ checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" [[package]] name = "arrow" -version = "57.0.0" +version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4df8bb5b0bd64c0b9bc61317fcc480bad0f00e56d3bc32c69a4c8dada4786bae" +checksum = "cb372a7cbcac02a35d3fb7b3fc1f969ec078e871f9bb899bf00a2e1809bec8a3" dependencies = [ "arrow-arith", "arrow-array", @@ -149,9 +149,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "57.0.0" +version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1a640186d3bd30a24cb42264c2dafb30e236a6f50d510e56d40b708c9582491" +checksum = "0f377dcd19e440174596d83deb49cd724886d91060c07fec4f67014ef9d54049" dependencies = [ "arrow-array", "arrow-buffer", @@ -163,9 +163,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "57.0.0" +version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "219fe420e6800979744c8393b687afb0252b3f8a89b91027d27887b72aa36d31" +checksum = "a23eaff85a44e9fa914660fb0d0bb00b79c4a3d888b5334adb3ea4330c84f002" 
dependencies = [ "ahash", "arrow-buffer", @@ -182,9 +182,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "57.0.0" +version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76885a2697a7edf6b59577f568b456afc94ce0e2edc15b784ce3685b6c3c5c27" +checksum = "a2819d893750cb3380ab31ebdc8c68874dd4429f90fd09180f3c93538bd21626" dependencies = [ "bytes", "half", @@ -194,13 +194,14 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "57.0.0" +version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c9ebb4c987e6b3b236fb4a14b20b34835abfdd80acead3ccf1f9bf399e1f168" +checksum = "e3d131abb183f80c450d4591dc784f8d7750c50c6e2bc3fcaad148afc8361271" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", + "arrow-ord", "arrow-schema", "arrow-select", "atoi", @@ -215,9 +216,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "57.0.0" +version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92386159c8d4bce96f8bd396b0642a0d544d471bdc2ef34d631aec80db40a09c" +checksum = "2275877a0e5e7e7c76954669366c2aa1a829e340ab1f612e647507860906fb6b" dependencies = [ "arrow-array", "arrow-cast", @@ -230,9 +231,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "57.0.0" +version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "727681b95de313b600eddc2a37e736dcb21980a40f640314dcf360e2f36bc89b" +checksum = "05738f3d42cb922b9096f7786f606fcb8669260c2640df8490533bb2fa38c9d3" dependencies = [ "arrow-buffer", "arrow-schema", @@ -263,9 +264,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "57.0.0" +version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da9ba92e3de170295c98a84e5af22e2b037f0c7b32449445e6c493b5fca27f27" +checksum = "3d09446e8076c4b3f235603d9ea7c5494e73d441b01cd61fb33d7254c11964b3" dependencies = [ "arrow-array", "arrow-buffer", @@ -278,9 +279,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "57.0.0" +version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b969b4a421ae83828591c6bf5450bd52e6d489584142845ad6a861f42fe35df8" +checksum = "371ffd66fa77f71d7628c63f209c9ca5341081051aa32f9c8020feb0def787c0" dependencies = [ "arrow-array", "arrow-buffer", @@ -302,9 +303,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "57.0.0" +version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "141c05298b21d03e88062317a1f1a73f5ba7b6eb041b350015b1cd6aabc0519b" +checksum = "cbc94fc7adec5d1ba9e8cd1b1e8d6f72423b33fe978bf1f46d970fafab787521" dependencies = [ "arrow-array", "arrow-buffer", @@ -315,9 +316,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "57.0.0" +version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f3c06a6abad6164508ed283c7a02151515cef3de4b4ff2cebbcaeb85533db2" +checksum = "169676f317157dc079cc5def6354d16db63d8861d61046d2f3883268ced6f99f" dependencies = [ "arrow-array", "arrow-buffer", @@ -328,9 +329,9 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "57.0.0" +version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cfa7a03d1eee2a4d061476e1840ad5c9867a544ca6c4c59256496af5d0a8be5" +checksum = "d27609cd7dd45f006abae27995c2729ef6f4b9361cde1ddd019dc31a5aa017e0" dependencies = [ "serde_core", "serde_json", @@ -338,9 +339,9 @@ dependencies = [ 
[[package]] name = "arrow-select" -version = "57.0.0" +version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bafa595babaad59f2455f4957d0f26448fb472722c186739f4fac0823a1bdb47" +checksum = "ae980d021879ea119dd6e2a13912d81e64abed372d53163e804dfe84639d8010" dependencies = [ "ahash", "arrow-array", @@ -352,9 +353,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "57.0.0" +version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32f46457dbbb99f2650ff3ac23e46a929e0ab81db809b02aa5511c258348bef2" +checksum = "cf35e8ef49dcf0c5f6d175edee6b8af7b45611805333129c541a8b89a0fc0534" dependencies = [ "arrow-array", "arrow-buffer", @@ -375,7 +376,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn 2.0.111", ] [[package]] @@ -1217,8 +1218,7 @@ dependencies = [ [[package]] name = "datafusion" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ba7cb113e9c0bedf9e9765926031e132fa05a1b09ba6e93a6d1a4d7044457b8" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "arrow", "arrow-schema", @@ -1258,7 +1258,6 @@ dependencies = [ "parquet", "rand 0.9.2", "regex", - "rstest", "sqlparser", "tempfile", "tokio", @@ -1269,8 +1268,7 @@ dependencies = [ [[package]] name = "datafusion-catalog" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66a3a799f914a59b1ea343906a0486f17061f39509af74e874a866428951130d" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "arrow", "async-trait", @@ -1294,8 +1292,7 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db1b113c80d7a0febcd901476a57aef378e717c54517a163ed51417d87621b0" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "arrow", "async-trait", @@ -1312,14 +1309,12 @@ dependencies = [ "itertools", "log", "object_store", - "tokio", ] [[package]] name = "datafusion-common" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c10f7659e96127d25e8366be7c8be4109595d6a2c3eac70421f380a7006a1b0" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "ahash", "arrow", @@ -1341,8 +1336,7 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b92065bbc6532c6651e2f7dd30b55cba0c7a14f860c7e1d15f165c41a1868d95" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "futures", "log", @@ -1352,8 +1346,7 @@ dependencies = [ [[package]] name = "datafusion-datasource" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "fde13794244bc7581cd82f6fff217068ed79cdc344cafe4ab2c3a1c3510b38d6" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "arrow", "async-trait", @@ -1381,8 +1374,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-arrow" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "804fa9b4ecf3157982021770617200ef7c1b2979d57bec9044748314775a9aea" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "arrow", "arrow-ipc", @@ -1405,8 +1397,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-csv" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61a1641a40b259bab38131c5e6f48fac0717bedb7dc93690e604142a849e0568" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "arrow", "async-trait", @@ -1428,8 +1419,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-json" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adeacdb00c1d37271176f8fb6a1d8ce096baba16ea7a4b2671840c5c9c64fe85" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "arrow", "async-trait", @@ -1450,8 +1440,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-parquet" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43d0b60ffd66f28bfb026565d62b0a6cbc416da09814766a3797bba7d85a3cd9" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "arrow", "async-trait", @@ -1544,17 +1533,16 @@ dependencies = [ [[package]] name = "datafusion-doc" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b99e13947667b36ad713549237362afb054b2d8f8cc447751e23ec61202db07" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" [[package]] name = "datafusion-execution" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63695643190679037bc946ad46a263b62016931547bf119859c511f7ff2f5178" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "arrow", "async-trait", + "chrono", "dashmap", "datafusion-common", "datafusion-expr", @@ -1570,8 +1558,7 @@ dependencies = [ [[package]] name = "datafusion-expr" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9a4787cbf5feb1ab351f789063398f67654a6df75c4d37d7f637dc96f951a91" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "arrow", "async-trait", 
@@ -1592,8 +1579,7 @@ dependencies = [ [[package]] name = "datafusion-expr-common" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ce2fb1b8c15c9ac45b0863c30b268c69dc9ee7a1ee13ecf5d067738338173dc" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "arrow", "datafusion-common", @@ -1605,8 +1591,7 @@ dependencies = [ [[package]] name = "datafusion-functions" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "794a9db7f7b96b3346fc007ff25e994f09b8f0511b4cf7dff651fadfe3ebb28f" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "arrow", "arrow-buffer", @@ -1631,8 +1616,7 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c25210520a9dcf9c2b2cbbce31ebd4131ef5af7fc60ee92b266dc7d159cb305" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "ahash", "arrow", @@ -1652,8 +1636,7 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62f4a66f3b87300bb70f4124b55434d2ae3fe80455f3574701d0348da040b55d" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "ahash", "arrow", @@ -1665,8 +1648,7 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae5c06eed03918dc7fe7a9f082a284050f0e9ecf95d72f57712d1496da03b8c4" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "arrow", "arrow-ord", @@ -1688,8 +1670,7 @@ dependencies = [ [[package]] name = "datafusion-functions-table" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db4fed1d71738fbe22e2712d71396db04c25de4111f1ec252b8f4c6d3b25d7f5" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "arrow", "async-trait", @@ -1704,8 +1685,7 @@ dependencies = [ [[package]] name = "datafusion-functions-window" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d92206aa5ae21892f1552b4d61758a862a70956e6fd7a95cb85db1de74bc6d1" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "arrow", "datafusion-common", @@ -1722,8 +1702,7 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"53ae9bcc39800820d53a22d758b3b8726ff84a5a3e24cecef04ef4e5fdf1c7cc" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1732,19 +1711,17 @@ dependencies = [ [[package]] name = "datafusion-macros" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1063ad4c9e094b3f798acee16d9a47bd7372d9699be2de21b05c3bd3f34ab848" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "datafusion-doc", "quote", - "syn 2.0.110", + "syn 2.0.111", ] [[package]] name = "datafusion-optimizer" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f35f9ec5d08b87fd1893a30c2929f2559c2f9806ca072d8fefca5009dc0f06a" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "arrow", "chrono", @@ -1762,8 +1739,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c30cc8012e9eedcb48bbe112c6eff4ae5ed19cf3003cb0f505662e88b7014c5d" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "ahash", "arrow", @@ -1779,13 +1755,13 @@ dependencies = [ "parking_lot", "paste", "petgraph", + "tokio", ] [[package]] name = "datafusion-physical-expr-adapter" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f9ff2dbd476221b1f67337699eff432781c4e6e1713d2aefdaa517dfbf79768" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "arrow", "datafusion-common", @@ -1799,8 +1775,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90da43e1ec550b172f34c87ec68161986ced70fd05c8d2a2add66eef9c276f03" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "ahash", "arrow", @@ -1813,8 +1788,7 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce9804f799acd7daef3be7aaffe77c0033768ed8fdbf5fb82fc4c5f2e6bc14e6" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "arrow", "datafusion-common", @@ -1831,19 +1805,18 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0acf0ad6b6924c6b1aa7d213b181e012e2d3ec0a64ff5b10ee6282ab0f8532ac" +source = 
"git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "ahash", "arrow", "arrow-ord", "arrow-schema", "async-trait", - "chrono", "datafusion-common", "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", + "datafusion-functions", "datafusion-functions-aggregate-common", "datafusion-functions-window-common", "datafusion-physical-expr", @@ -1862,8 +1835,7 @@ dependencies = [ [[package]] name = "datafusion-proto" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d368093a98a17d1449b1083ac22ed16b7128e4c67789991869480d8c4a40ecb9" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "arrow", "chrono", @@ -1889,8 +1861,7 @@ dependencies = [ [[package]] name = "datafusion-proto-common" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b6aef3d5e5c1d2bc3114c4876730cb76a9bdc5a8df31ef1b6db48f0c1671895" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "arrow", "datafusion-common", @@ -1900,8 +1871,7 @@ dependencies = [ [[package]] name = "datafusion-pruning" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac2c2498a1f134a9e11a9f5ed202a2a7d7e9774bd9249295593053ea3be999db" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "arrow", "datafusion-common", @@ -1917,8 +1887,7 @@ dependencies = [ [[package]] name = "datafusion-session" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f96eebd17555386f459037c65ab73aae8df09f464524c709d6a3134ad4f4776" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "async-trait", "datafusion-common", @@ -1931,8 +1900,7 @@ dependencies = [ [[package]] name = "datafusion-sql" version = "51.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fc195fe60634b2c6ccfd131b487de46dc30eccae8a3c35a13f136e7f440414f" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" dependencies = [ "arrow", "bigdecimal", @@ -1953,7 +1921,7 @@ checksum = "780eb241654bf097afb00fc5f054a09b687dad862e485fdcf8399bb056565370" dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn 2.0.111", ] [[package]] @@ -1990,7 +1958,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn 2.0.111", ] [[package]] @@ -2160,7 +2128,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn 2.0.111", ] [[package]] @@ -2175,12 +2143,6 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" -[[package]] -name = "futures-timer" -version = "3.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24" - [[package]] name = "futures-util" version = "0.3.31" @@ -2753,7 +2715,7 @@ checksum = "980af8b43c3ad5d8d349ace167ec8170839f753a42d233ba19e08afe1850fa69" dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn 2.0.111", ] [[package]] @@ -2895,9 +2857,9 @@ checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" [[package]] name = "lz4_flex" -version = "0.11.5" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08ab2867e3eeeca90e844d1940eab391c9dc5228783db2ed999acbc0a9ed375a" +checksum = "ab6473172471198271ff72e9379150e9dfd70d8e533e0752a27e515b48dd375e" dependencies = [ "twox-hash", ] @@ -3095,9 +3057,9 @@ dependencies = [ [[package]] name = "parquet" -version = "57.0.0" +version = "57.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a0f31027ef1af7549f7cec603a9a21dce706d3f8d7c2060a68f43c1773be95a" +checksum = "be3e4f6d320dd92bfa7d612e265d7d08bba0a240bab86af3425e1d255a511d89" dependencies = [ "ahash", "arrow-array", @@ -3189,7 +3151,7 @@ checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn 2.0.111", ] [[package]] @@ -3259,15 +3221,6 @@ dependencies = [ "yansi", ] -[[package]] -name = "proc-macro-crate" -version = "3.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983" -dependencies = [ - "toml_edit", -] - [[package]] name = "proc-macro-error" version = "1.0.4" @@ -3321,7 +3274,7 @@ dependencies = [ "itertools", "proc-macro2", "quote", - "syn 2.0.110", + "syn 2.0.111", ] [[package]] @@ -3516,12 +3469,6 @@ version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" -[[package]] -name = "relative-path" -version = "1.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba39f3699c378cd8970968dcbff9c43159ea4cfbd88d43c00b22f2ef10a435d2" - [[package]] name = "reqwest" version = "0.12.24" @@ -3578,35 +3525,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "rstest" -version = "0.26.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5a3193c063baaa2a95a33f03035c8a72b83d97a54916055ba22d35ed3839d49" -dependencies = [ - "futures-timer", - "futures-util", - "rstest_macros", -] - -[[package]] -name = "rstest_macros" -version = "0.26.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c845311f0ff7951c5506121a9ad75aec44d083c31583b2ea5a30bcb0b0abba0" -dependencies = [ - "cfg-if", - "glob", - "proc-macro-crate", - "proc-macro2", - "quote", - "regex", - "relative-path", - "rustc_version", - "syn 2.0.110", - "unicode-ident", -] - [[package]] name = "rustc-hash" version = "2.1.1" @@ -3846,7 +3764,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn 2.0.111", ] [[package]] @@ -3991,7 +3909,7 @@ checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn 2.0.111", ] [[package]] 
@@ -4049,9 +3967,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.110" +version = "2.0.111" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a99801b5bd34ede4cf3fc688c5919368fea4e4814a4664359503e6015b280aea" +checksum = "390cc9a294ab71bdb1aa2e99d13be9c753cd2d7bd6560c77118597410c4d2e87" dependencies = [ "proc-macro2", "quote", @@ -4075,7 +3993,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn 2.0.111", ] [[package]] @@ -4117,7 +4035,7 @@ checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn 2.0.111", ] [[package]] @@ -4220,7 +4138,7 @@ checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn 2.0.111", ] [[package]] @@ -4267,36 +4185,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "toml_datetime" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2cdb639ebbc97961c51720f858597f7f24c4fc295327923af55b74c3c724533" -dependencies = [ - "serde_core", -] - -[[package]] -name = "toml_edit" -version = "0.23.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6485ef6d0d9b5d0ec17244ff7eb05310113c3f316f2d14200d4de56b3cb98f8d" -dependencies = [ - "indexmap", - "toml_datetime", - "toml_parser", - "winnow", -] - -[[package]] -name = "toml_parser" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0cbe268d35bdb4bb5a56a2de88d0ad0eb70af5384a99d648cd4b3d04039800e" -dependencies = [ - "winnow", -] - [[package]] name = "tonic" version = "0.14.2" @@ -4420,7 +4308,7 @@ checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903" dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn 2.0.111", ] [[package]] @@ -4512,9 +4400,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.18.1" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2" +checksum = "e2e054861b4bd027cd373e18e8d8d8e6548085000e41290d95ce0c373a654b4a" dependencies = [ "getrandom 0.3.4", "js-sys", @@ -4618,7 +4506,7 @@ dependencies = [ "bumpalo", "proc-macro2", "quote", - "syn 2.0.110", + "syn 2.0.111", "wasm-bindgen-shared", ] @@ -4716,7 +4604,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn 2.0.111", ] [[package]] @@ -4727,7 +4615,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn 2.0.111", ] [[package]] @@ -4919,15 +4807,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" -[[package]] -name = "winnow" -version = "0.7.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21a0236b59786fed61e2a80582dd500fe61f18b5dca67a4a067d0bc9039339cf" -dependencies = [ - "memchr", -] - [[package]] name = "wit-bindgen" version = "0.46.0" @@ -4971,7 +4850,7 @@ checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + 
"syn 2.0.111", "synstructure", ] @@ -4992,7 +4871,7 @@ checksum = "c640b22cd9817fae95be82f0d2f90b11f7605f6c319d16705c459b27ac2cbc26" dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn 2.0.111", ] [[package]] @@ -5012,7 +4891,7 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn 2.0.111", "synstructure", ] @@ -5052,7 +4931,7 @@ checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.110", + "syn 2.0.111", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 16db0654..c631e700 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,8 +2,13 @@ members = ["benchmarks"] [workspace.dependencies] -datafusion = { version = "51.0.0", default-features = false } -datafusion-proto = { version = "51.0.0" } +# Using PR #19304: Hash partitioning satisfies subset +# https://github.com/apache/datafusion/pull/19304 +# This PR includes: +# - Hash subset satisfaction logic (eliminates unnecessary repartitions) +# - File partitioning preservation (from PR #19124) +datafusion = { git = "https://github.com/gene-bordegaray/datafusion.git", branch = "gene.bordegaray/2025/12/hash_partitioning_satisfies_subset", default-features = false } +datafusion-proto = { git = "https://github.com/gene-bordegaray/datafusion.git", branch = "gene.bordegaray/2025/12/hash_partitioning_satisfies_subset" } [package] name = "datafusion-distributed" @@ -68,7 +73,7 @@ insta = { version = "1.43.1", features = ["filters"] } tpchgen = { git = "https://github.com/clflushopt/tpchgen-rs", rev = "e83365a5a9101906eb9f78c5607b83bc59849acf" } tpchgen-arrow = { git = "https://github.com/clflushopt/tpchgen-rs", rev = "e83365a5a9101906eb9f78c5607b83bc59849acf" } parquet = "57.0.0" -arrow = "57.0.0" +arrow = "57.0.1" tokio-stream = "0.1.17" hyper-util = "0.1.16" pretty_assertions = "1.4" diff --git a/src/metrics/proto.rs b/src/metrics/proto.rs index bbfcf66b..b467baea 100644 --- a/src/metrics/proto.rs +++ b/src/metrics/proto.rs @@ -288,7 +288,7 @@ pub fn df_metric_to_proto(metric: Arc) -> Result internal_err!("{}", CUSTOM_METRICS_NOT_SUPPORTED), - MetricValue::OutputBytes(_) | MetricValue::PruningMetrics { .. } | MetricValue::Ratio { .. } => { + MetricValue::OutputBytes(_) | MetricValue::PruningMetrics { .. } | MetricValue::Ratio { .. 
} | MetricValue::OutputBatches(_) => { // TODO: Support these metrics internal_err!("{}", UNSUPPORTED_METRICS) } diff --git a/src/protobuf/errors/datafusion_error.rs b/src/protobuf/errors/datafusion_error.rs index 36095718..4039835a 100644 --- a/src/protobuf/errors/datafusion_error.rs +++ b/src/protobuf/errors/datafusion_error.rs @@ -173,6 +173,9 @@ impl DataFusionErrorProto { DataFusionErrorProto::from_datafusion_error(err.as_ref()), ))), }, + DataFusionError::Ffi(err) => DataFusionErrorProto { + inner: Some(DataFusionErrorInnerProto::Plan(err.clone())), + }, } } diff --git a/testdata/join/csv/dim/d_dkey=B/data0.csv b/testdata/join/csv/dim/d_dkey=B/data0.csv index 98999c08..6a883d21 100644 --- a/testdata/join/csv/dim/d_dkey=B/data0.csv +++ b/testdata/join/csv/dim/d_dkey=B/data0.csv @@ -1,4 +1,3 @@ env,service,host prod,log,host-x -prod,api,host-x diff --git a/testdata/join/csv/dim/d_dkey=C/data0.csv b/testdata/join/csv/dim/d_dkey=C/data0.csv index f34ed438..0c8f0cfd 100644 --- a/testdata/join/csv/dim/d_dkey=C/data0.csv +++ b/testdata/join/csv/dim/d_dkey=C/data0.csv @@ -1,4 +1,3 @@ env,service,host dev,trace,host-z -prod,log,host-y diff --git a/testdata/join/csv/dim/d_dkey=D/data0.csv b/testdata/join/csv/dim/d_dkey=D/data0.csv index 5e9e02bd..9ea19d80 100644 --- a/testdata/join/csv/dim/d_dkey=D/data0.csv +++ b/testdata/join/csv/dim/d_dkey=D/data0.csv @@ -1,4 +1,3 @@ env,service,host -dev,log,host-x -prod,trace,host-z +prod,trace,host-x diff --git a/testdata/join/csv/fact/d_dkey=A/data0.csv b/testdata/join/csv/fact/f_dkey=A/data0.csv similarity index 100% rename from testdata/join/csv/fact/d_dkey=A/data0.csv rename to testdata/join/csv/fact/f_dkey=A/data0.csv diff --git a/testdata/join/csv/fact/d_dkey=B/data0.csv b/testdata/join/csv/fact/f_dkey=B/data0.csv similarity index 100% rename from testdata/join/csv/fact/d_dkey=B/data0.csv rename to testdata/join/csv/fact/f_dkey=B/data0.csv diff --git a/testdata/join/csv/fact/d_dkey=B/data1.csv b/testdata/join/csv/fact/f_dkey=B/data1.csv similarity index 100% rename from testdata/join/csv/fact/d_dkey=B/data1.csv rename to testdata/join/csv/fact/f_dkey=B/data1.csv diff --git a/testdata/join/csv/fact/d_dkey=B/data2.csv b/testdata/join/csv/fact/f_dkey=B/data2.csv similarity index 100% rename from testdata/join/csv/fact/d_dkey=B/data2.csv rename to testdata/join/csv/fact/f_dkey=B/data2.csv diff --git a/testdata/join/csv/fact/d_dkey=C/data0.csv b/testdata/join/csv/fact/f_dkey=C/data0.csv similarity index 100% rename from testdata/join/csv/fact/d_dkey=C/data0.csv rename to testdata/join/csv/fact/f_dkey=C/data0.csv diff --git a/testdata/join/csv/fact/d_dkey=C/data1.csv b/testdata/join/csv/fact/f_dkey=C/data1.csv similarity index 100% rename from testdata/join/csv/fact/d_dkey=C/data1.csv rename to testdata/join/csv/fact/f_dkey=C/data1.csv diff --git a/testdata/join/csv/fact/d_dkey=D/data0.csv b/testdata/join/csv/fact/f_dkey=D/data0.csv similarity index 100% rename from testdata/join/csv/fact/d_dkey=D/data0.csv rename to testdata/join/csv/fact/f_dkey=D/data0.csv diff --git a/testdata/join/generate_parquet_from_csv.sql b/testdata/join/generate_parquet_from_csv.sql index d3b27813..7137ff6d 100644 --- a/testdata/join/generate_parquet_from_csv.sql +++ b/testdata/join/generate_parquet_from_csv.sql @@ -18,31 +18,31 @@ TO "testdata/join/parquet/dim/d_dkey=D/data0.parquet" STORED AS PARQUET; -- Generate parquet fact files from csv files. 
-COPY (SELECT * FROM "testdata/join/csv/fact/d_dkey=A/data0.csv") -TO "testdata/join/parquet/fact/d_dkey=A/data0.parquet" +COPY (SELECT * FROM "testdata/join/csv/fact/f_dkey=A/data0.csv") +TO "testdata/join/parquet/fact/f_dkey=A/data0.parquet" STORED AS PARQUET; -COPY (SELECT * FROM "testdata/join/csv/fact/d_dkey=B/data0.csv") -TO "testdata/join/parquet/fact/d_dkey=B/data0.parquet" +COPY (SELECT * FROM "testdata/join/csv/fact/f_dkey=B/data0.csv") +TO "testdata/join/parquet/fact/f_dkey=B/data0.parquet" STORED AS PARQUET; -COPY (SELECT * FROM "testdata/join/csv/fact/d_dkey=B/data1.csv") -TO "testdata/join/parquet/fact/d_dkey=B/data1.parquet" +COPY (SELECT * FROM "testdata/join/csv/fact/f_dkey=B/data1.csv") +TO "testdata/join/parquet/fact/f_dkey=B/data1.parquet" STORED AS PARQUET; -COPY (SELECT * FROM "testdata/join/csv/fact/d_dkey=B/data2.csv") -TO "testdata/join/parquet/fact/d_dkey=B/data2.parquet" +COPY (SELECT * FROM "testdata/join/csv/fact/f_dkey=B/data2.csv") +TO "testdata/join/parquet/fact/f_dkey=B/data2.parquet" STORED AS PARQUET; -COPY (SELECT * FROM "testdata/join/csv/fact/d_dkey=C/data0.csv") -TO "testdata/join/parquet/fact/d_dkey=C/data0.parquet" +COPY (SELECT * FROM "testdata/join/csv/fact/f_dkey=C/data0.csv") +TO "testdata/join/parquet/fact/f_dkey=C/data0.parquet" STORED AS PARQUET; -COPY (SELECT * FROM "testdata/join/csv/fact/d_dkey=C/data1.csv") -TO "testdata/join/parquet/fact/d_dkey=C/data1.parquet" +COPY (SELECT * FROM "testdata/join/csv/fact/f_dkey=C/data1.csv") +TO "testdata/join/parquet/fact/f_dkey=C/data1.parquet" STORED AS PARQUET; -COPY (SELECT * FROM "testdata/join/csv/fact/d_dkey=D/data0.csv") -TO "testdata/join/parquet/fact/d_dkey=D/data0.parquet" +COPY (SELECT * FROM "testdata/join/csv/fact/f_dkey=D/data0.csv") +TO "testdata/join/parquet/fact/f_dkey=D/data0.parquet" STORED AS PARQUET; diff --git a/testdata/join/parquet/dim/d_dkey=B/data0.parquet b/testdata/join/parquet/dim/d_dkey=B/data0.parquet index 98389be3..5d114c63 100644 --- a/testdata/join/parquet/dim/d_dkey=B/data0.parquet +++ b/testdata/join/parquet/dim/d_dkey=B/data0.parquet @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c669a99a72f130361c718865d68930b36321e70f8ef0c4e09711ab555a90e610 -size 1024 +oid sha256:0788861c3959ba7ac5f14d3cfbb9ecfa64fd2e823ea08d23a9b7da65255794d3 +size 1016 diff --git a/testdata/join/parquet/dim/d_dkey=C/data0.parquet b/testdata/join/parquet/dim/d_dkey=C/data0.parquet index fe6b7060..f1451fac 100644 --- a/testdata/join/parquet/dim/d_dkey=C/data0.parquet +++ b/testdata/join/parquet/dim/d_dkey=C/data0.parquet @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:46574b458899c78edab412f8d099df6992bc04fe3cd077c9d95be996294a2cb3 -size 1044 +oid sha256:0cf9e37dc5368a9de0badc5a6392e949e942d96d9e25ddd98217544e2629995c +size 1021 diff --git a/testdata/join/parquet/dim/d_dkey=D/data0.parquet b/testdata/join/parquet/dim/d_dkey=D/data0.parquet index b2df4542..ecfeae2a 100644 --- a/testdata/join/parquet/dim/d_dkey=D/data0.parquet +++ b/testdata/join/parquet/dim/d_dkey=D/data0.parquet @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:725fa8b471231b9afb1476b84844eea020d84c949fe44dab426a52af7566f8b4 -size 1044 +oid sha256:b4531e8da8141539997e09816af7e1c6090c90a9bcf36ad308973b6029608703 +size 1026 diff --git a/testdata/join/parquet/fact/d_dkey=A/data0.parquet b/testdata/join/parquet/fact/f_dkey=A/data0.parquet similarity index 100% rename from testdata/join/parquet/fact/d_dkey=A/data0.parquet rename to 
testdata/join/parquet/fact/f_dkey=A/data0.parquet diff --git a/testdata/join/parquet/fact/d_dkey=B/data0.parquet b/testdata/join/parquet/fact/f_dkey=B/data0.parquet similarity index 100% rename from testdata/join/parquet/fact/d_dkey=B/data0.parquet rename to testdata/join/parquet/fact/f_dkey=B/data0.parquet diff --git a/testdata/join/parquet/fact/d_dkey=B/data1.parquet b/testdata/join/parquet/fact/f_dkey=B/data1.parquet similarity index 100% rename from testdata/join/parquet/fact/d_dkey=B/data1.parquet rename to testdata/join/parquet/fact/f_dkey=B/data1.parquet diff --git a/testdata/join/parquet/fact/d_dkey=B/data2.parquet b/testdata/join/parquet/fact/f_dkey=B/data2.parquet similarity index 100% rename from testdata/join/parquet/fact/d_dkey=B/data2.parquet rename to testdata/join/parquet/fact/f_dkey=B/data2.parquet diff --git a/testdata/join/parquet/fact/d_dkey=C/data0.parquet b/testdata/join/parquet/fact/f_dkey=C/data0.parquet similarity index 100% rename from testdata/join/parquet/fact/d_dkey=C/data0.parquet rename to testdata/join/parquet/fact/f_dkey=C/data0.parquet diff --git a/testdata/join/parquet/fact/d_dkey=C/data1.parquet b/testdata/join/parquet/fact/f_dkey=C/data1.parquet similarity index 100% rename from testdata/join/parquet/fact/d_dkey=C/data1.parquet rename to testdata/join/parquet/fact/f_dkey=C/data1.parquet diff --git a/testdata/join/parquet/fact/d_dkey=D/data0.parquet b/testdata/join/parquet/fact/f_dkey=D/data0.parquet similarity index 100% rename from testdata/join/parquet/fact/d_dkey=D/data0.parquet rename to testdata/join/parquet/fact/f_dkey=D/data0.parquet diff --git a/tests/join.rs b/tests/join.rs index edd10741..80cb6e45 100644 --- a/tests/join.rs +++ b/tests/join.rs @@ -1,35 +1,76 @@ -#[cfg(all(feature = "integration", test))] +#[cfg(test)] mod tests { - use arrow::util::pretty; + use arrow::{datatypes::DataType, util::pretty}; use datafusion::{ - assert_batches_eq, physical_plan::{collect, displayable}, prelude::{ParquetReadOptions, SessionContext}, }; use datafusion_distributed::{ - DefaultSessionBuilder, display_plan_ascii, - test_utils::localhost::start_localhost_context, + DefaultSessionBuilder, display_plan_ascii, test_utils::localhost::start_localhost_context, }; + fn set_optimizer_settings(ctx: &SessionContext) { + ctx.state_ref() + .write() + .config_mut() + .options_mut() + .optimizer + .hash_join_single_partition_threshold = 0; + ctx.state_ref() + .write() + .config_mut() + .options_mut() + .optimizer + .hash_join_single_partition_threshold_rows = 0; + ctx.state_ref() + .write() + .config_mut() + .options_mut() + .optimizer + .preserve_file_partitions; + ctx.state_ref() + .write() + .config_mut() + .options_mut() + .optimizer + .hash_join_single_partition_threshold = 0; + ctx.state_ref() + .write() + .config_mut() + .options_mut() + .optimizer + .hash_join_single_partition_threshold_rows = 0; + } + #[tokio::test] async fn test_join_distributed() -> Result<(), Box> { - - let query = "SELECT * FROM dim JOIN fact ON dim.key = fact.key ORDER BY dim.key, dim.col1, fact.col1"; + let query = r#" + SELECT + f.f_dkey, + f.timestamp, + f.value, + d.env, + d.service, + d.host + FROM dim d + INNER JOIN fact f ON d.d_dkey = f.f_dkey + ORDER BY f.f_dkey, f.timestamp + "#; // Execute the query using single node datafusion. 
let ctx = SessionContext::new(); - ctx.register_parquet( - "dim", - "testdata/join/parquet/dim", - ParquetReadOptions::new(), - ) - .await?; - ctx.register_parquet( - "fact", - "testdata/join/parquet/fact", - ParquetReadOptions::new(), - ) - .await?; + + // Register hive-style partitioning for the dim table. + let dim_options = ParquetReadOptions::default() + .table_partition_cols(vec![("d_dkey".to_string(), DataType::Utf8)]); + ctx.register_parquet("dim", "testdata/join/parquet/dim", dim_options) + .await?; + + // Register hive-style partitioning for the fact table. + let fact_options = ParquetReadOptions::default() + .table_partition_cols(vec![("f_dkey".to_string(), DataType::Utf8)]); + ctx.register_parquet("fact", "testdata/join/parquet/fact", fact_options) + .await?; let df = ctx.sql(query).await?; @@ -43,35 +84,21 @@ mod tests { // Execute the query using distributed datafusion. let (distributed_ctx, _guard) = start_localhost_context(4, DefaultSessionBuilder).await; + // Register hive-style partitioning for the dim table. + let dim_options = ParquetReadOptions::default() + .table_partition_cols(vec![("d_dkey".to_string(), DataType::Utf8)]); distributed_ctx - .register_parquet( - "dim", - "testdata/join/parquet/dim", - ParquetReadOptions::new(), - ) + .register_parquet("dim", "testdata/join/parquet/dim", dim_options) .await?; + + // Register hive-style partitioning for the fact table. + let fact_options = ParquetReadOptions::default() + .table_partition_cols(vec![("f_dkey".to_string(), DataType::Utf8)]); distributed_ctx - .register_parquet( - "fact", - "testdata/join/parquet/fact", - ParquetReadOptions::new(), - ) + .register_parquet("fact", "testdata/join/parquet/fact", fact_options) .await?; - distributed_ctx - .state_ref() - .write() - .config_mut() - .options_mut() - .optimizer - .hash_join_single_partition_threshold = 0; - distributed_ctx - .state_ref() - .write() - .config_mut() - .options_mut() - .optimizer - .hash_join_single_partition_threshold_rows = 0; + set_optimizer_settings(&distributed_ctx); let df = distributed_ctx.sql(query).await?; @@ -89,4 +116,3 @@ mod tests { Ok(()) } } - From 58383c8b61f88839491f21a5082b25df18543b44 Mon Sep 17 00:00:00 2001 From: Justin O'Dwyer Date: Thu, 18 Dec 2025 16:10:10 -0500 Subject: [PATCH 05/14] Fixed configs and achieved optimal distributed plan. 
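The set_optimizer_settings helper added in the previous patch had two
defects: it named `.preserve_file_partitions` in a bare path statement
without assigning it, and it set both hash-join thresholds twice. This
patch assigns every option explicitly and adds
`subset_satisfaction_partition_threshold`. For reference, a consolidated
sketch of what the helper now configures; the option names are the ones
the branch under test exposes, and the single write-guard form is just a
tidier equivalent of the chained calls in the diff, not the literal code:

    use datafusion::prelude::SessionContext;

    fn set_optimizer_settings(ctx: &SessionContext) {
        // Take one write guard on the session state instead of re-locking
        // for every option.
        let state = ctx.state_ref();
        let mut state = state.write();
        let optimizer = &mut state.config_mut().options_mut().optimizer;
        // Force a partitioned hash join regardless of build-side size.
        optimizer.hash_join_single_partition_threshold = 0;
        optimizer.hash_join_single_partition_threshold_rows = 0;
        // Keep one output partition per hive partition instead of
        // regrouping files.
        optimizer.preserve_file_partitions = 1;
        // A high threshold so subset satisfaction is preferred over
        // inserting a repartition.
        optimizer.subset_satisfaction_partition_threshold = 999;
    }
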
--- tests/join.rs | 64 +++++++++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 33 deletions(-) diff --git a/tests/join.rs b/tests/join.rs index 80cb6e45..be52c1a9 100644 --- a/tests/join.rs +++ b/tests/join.rs @@ -9,39 +9,6 @@ mod tests { DefaultSessionBuilder, display_plan_ascii, test_utils::localhost::start_localhost_context, }; - fn set_optimizer_settings(ctx: &SessionContext) { - ctx.state_ref() - .write() - .config_mut() - .options_mut() - .optimizer - .hash_join_single_partition_threshold = 0; - ctx.state_ref() - .write() - .config_mut() - .options_mut() - .optimizer - .hash_join_single_partition_threshold_rows = 0; - ctx.state_ref() - .write() - .config_mut() - .options_mut() - .optimizer - .preserve_file_partitions; - ctx.state_ref() - .write() - .config_mut() - .options_mut() - .optimizer - .hash_join_single_partition_threshold = 0; - ctx.state_ref() - .write() - .config_mut() - .options_mut() - .optimizer - .hash_join_single_partition_threshold_rows = 0; - } - #[tokio::test] async fn test_join_distributed() -> Result<(), Box> { let query = r#" @@ -115,4 +82,35 @@ mod tests { Ok(()) } + + fn set_optimizer_settings(ctx: &SessionContext) { + // Ensure that we always use a partitioned hash join. + ctx.state_ref() + .write() + .config_mut() + .options_mut() + .optimizer + .hash_join_single_partition_threshold = 0; + ctx.state_ref() + .write() + .config_mut() + .options_mut() + .optimizer + .hash_join_single_partition_threshold_rows = 0; + + // Always preserve file partitions. + ctx.state_ref() + .write() + .config_mut() + .options_mut() + .optimizer + .preserve_file_partitions = 1; + // Set to a high value to ensure we always use the subset satisfaction optimization. + ctx.state_ref() + .write() + .config_mut() + .options_mut() + .optimizer + .subset_satisfaction_partition_threshold = 999; + } } From 7de49da445fa191e432d5cd48164d77653dbac45 Mon Sep 17 00:00:00 2001 From: Justin O'Dwyer Date: Mon, 22 Dec 2025 15:54:42 -0500 Subject: [PATCH 06/14] Refactoring, adding comments. --- tests/join.rs | 132 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 84 insertions(+), 48 deletions(-) diff --git a/tests/join.rs b/tests/join.rs index be52c1a9..8774f936 100644 --- a/tests/join.rs +++ b/tests/join.rs @@ -10,9 +10,9 @@ mod tests { }; #[tokio::test] - async fn test_join_distributed() -> Result<(), Box> { + async fn test_join_hive() -> Result<(), Box> { let query = r#" - SELECT + SELECT f.f_dkey, f.timestamp, f.value, @@ -21,36 +21,57 @@ mod tests { d.host FROM dim d INNER JOIN fact f ON d.d_dkey = f.f_dkey + WHERE d.service = 'log' ORDER BY f.f_dkey, f.timestamp "#; - // Execute the query using single node datafusion. - let ctx = SessionContext::new(); - - // Register hive-style partitioning for the dim table. - let dim_options = ParquetReadOptions::default() - .table_partition_cols(vec![("d_dkey".to_string(), DataType::Utf8)]); - ctx.register_parquet("dim", "testdata/join/parquet/dim", dim_options) - .await?; + // ————————————————————————————————————————————————————————————— + // Execute the query using distributed datafusion, 2 workers, + // and hive-style partitioned data. + // ————————————————————————————————————————————————————————————— - // Register hive-style partitioning for the fact table. 
- let fact_options = ParquetReadOptions::default() - .table_partition_cols(vec![("f_dkey".to_string(), DataType::Utf8)]); - ctx.register_parquet("fact", "testdata/join/parquet/fact", fact_options) - .await?; + let (distributed_ctx, _guard) = start_localhost_context(2, DefaultSessionBuilder).await; - let df = ctx.sql(query).await?; - - let (state, logical_plan) = df.into_parts(); - let physical_plan = state.create_physical_plan(&logical_plan).await?; - // println!("\n——————— PHYSICAL PLAN ———————\n"); - println!("{}", displayable(physical_plan.as_ref()).indent(true)); - - let non_distributed_result = collect(physical_plan, state.task_ctx()).await?; - pretty::print_batches(&non_distributed_result)?; + // Preserve hive-style file partitions. + distributed_ctx + .state_ref() + .write() + .config_mut() + .options_mut() + .optimizer + .preserve_file_partitions = 1; + // Set a high threshold to encourage subset satisfaction. + distributed_ctx + .state_ref() + .write() + .config_mut() + .options_mut() + .optimizer + .subset_satisfaction_partition_threshold = 999; + // Read data from 4 hive-style partitions. + distributed_ctx + .state_ref() + .write() + .config_mut() + .options_mut() + .execution + .target_partitions = 4; + // Ensure that we use a partitioned hash join. + distributed_ctx + .state_ref() + .write() + .config_mut() + .options_mut() + .optimizer + .hash_join_single_partition_threshold = 0; + distributed_ctx + .state_ref() + .write() + .config_mut() + .options_mut() + .optimizer + .hash_join_single_partition_threshold_rows = 0; - // Execute the query using distributed datafusion. - let (distributed_ctx, _guard) = start_localhost_context(4, DefaultSessionBuilder).await; // Register hive-style partitioning for the dim table. let dim_options = ParquetReadOptions::default() .table_partition_cols(vec![("d_dkey".to_string(), DataType::Utf8)]); @@ -65,10 +86,7 @@ mod tests { .register_parquet("fact", "testdata/join/parquet/fact", fact_options) .await?; - set_optimizer_settings(&distributed_ctx); - let df = distributed_ctx.sql(query).await?; - let (state, logical_plan) = df.into_parts(); let physical_plan = state.create_physical_plan(&logical_plan).await?; println!("\n——————— DISTRIBUTED PLAN ———————\n"); @@ -77,40 +95,58 @@ mod tests { let distributed_result = collect(physical_plan, state.task_ctx()).await?; pretty::print_batches(&distributed_result)?; - // Compare single-node and distributed results. - assert_eq!(non_distributed_result, distributed_result); + // ————————————————————————————————————————————————————————————— + // Execute the query using single node datafusion for comparison. + // ————————————————————————————————————————————————————————————— - Ok(()) - } + let ctx = SessionContext::new(); - fn set_optimizer_settings(ctx: &SessionContext) { - // Ensure that we always use a partitioned hash join. ctx.state_ref() .write() .config_mut() .options_mut() .optimizer - .hash_join_single_partition_threshold = 0; - ctx.state_ref() - .write() - .config_mut() - .options_mut() - .optimizer - .hash_join_single_partition_threshold_rows = 0; - - // Always preserve file partitions. + .preserve_file_partitions = 1; ctx.state_ref() .write() .config_mut() .options_mut() .optimizer - .preserve_file_partitions = 1; - // Set to a high value to ensure we always use the subset satisfaction optimization. 
+ .subset_satisfaction_partition_threshold = 999; ctx.state_ref() .write() .config_mut() .options_mut() - .optimizer - .subset_satisfaction_partition_threshold = 999; + .execution + .target_partitions = 4; + + // Register hive-style partitioning for the dim table. + let dim_options = ParquetReadOptions::default() + .table_partition_cols(vec![("d_dkey".to_string(), DataType::Utf8)]); + ctx.register_parquet("dim", "testdata/join/parquet/dim", dim_options) + .await?; + + // Register hive-style partitioning for the fact table. + let fact_options = ParquetReadOptions::default() + .table_partition_cols(vec![("f_dkey".to_string(), DataType::Utf8)]); + ctx.register_parquet("fact", "testdata/join/parquet/fact", fact_options) + .await?; + + let df = ctx.sql(query).await?; + let (state, logical_plan) = df.into_parts(); + let physical_plan = state.create_physical_plan(&logical_plan).await?; + println!("\n——————— PHYSICAL PLAN ———————\n"); + println!("{}", displayable(physical_plan.as_ref()).indent(true)); + + let non_distributed_result = collect(physical_plan, state.task_ctx()).await?; + pretty::print_batches(&non_distributed_result)?; + + // ————————————————————————————————————————————————————————————— + // Ensure distributed and single-node results are equivalent. + // ————————————————————————————————————————————————————————————— + + assert_eq!(distributed_result, non_distributed_result); + + Ok(()) } } From 23d7e831b6d8818c269792252355b82cabb18a13 Mon Sep 17 00:00:00 2001 From: Justin O'Dwyer Date: Mon, 22 Dec 2025 16:47:19 -0500 Subject: [PATCH 07/14] Added check to ensure optimal plan is achieved. --- tests/join.rs | 46 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/tests/join.rs b/tests/join.rs index 8774f936..98339d4c 100644 --- a/tests/join.rs +++ b/tests/join.rs @@ -90,13 +90,43 @@ mod tests { let (state, logical_plan) = df.into_parts(); let physical_plan = state.create_physical_plan(&logical_plan).await?; println!("\n——————— DISTRIBUTED PLAN ———————\n"); - println!("{}", display_plan_ascii(physical_plan.as_ref(), false)); + let distributed_plan = display_plan_ascii(physical_plan.as_ref(), false); + println!("{}", distributed_plan); let distributed_result = collect(physical_plan, state.task_ctx()).await?; pretty::print_batches(&distributed_result)?; // ————————————————————————————————————————————————————————————— - // Execute the query using single node datafusion for comparison. + // Ensure the distributed plan matches our target plan, utilizing + // hive-style partitioning. 
+ // ————————————————————————————————————————————————————————————— + + let target_plan = r#"┌───── DistributedExec ── Tasks: t0:[p0] +│ SortPreservingMergeExec: [f_dkey@0 ASC NULLS LAST, timestamp@1 ASC NULLS LAST] +│ [Stage 1] => NetworkCoalesceExec: output_partitions=4, input_tasks=2 +└────────────────────────────────────────────────── + ┌───── Stage 1 ── Tasks: t0:[p0..p1] t1:[p2..p3] + │ SortExec: expr=[f_dkey@0 ASC NULLS LAST, timestamp@1 ASC NULLS LAST], preserve_partitioning=[true] + │ ProjectionExec: expr=[f_dkey@5 as f_dkey, timestamp@3 as timestamp, value@4 as value, env@0 as env, service@1 as service, host@2 as host] + │ HashJoinExec: mode=Partitioned, join_type=Inner, on=[(d_dkey@3, f_dkey@2)], projection=[env@0, service@1, host@2, timestamp@4, value@5, f_dkey@6] + │ FilterExec: service@1 = log + │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] + │ DataSourceExec: file_groups={4 groups: [[testdata/join/parquet/dim/d_dkey=A/data0.parquet], [testdata/join/parquet/dim/d_dkey=B/data0.parquet], [testdata/join/parquet/dim/d_dkey=C/data0.parquet], [testdata/join/parquet/dim/d_dkey=D/data0.parquet]]}, projection=[env, service, host, d_dkey], file_type=parquet, predicate=service@1 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)] + │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] + │ DataSourceExec: file_groups={4 groups: [[testdata/join/parquet/fact/f_dkey=A/data0.parquet], [testdata/join/parquet/fact/f_dkey=B/data2.parquet, testdata/join/parquet/fact/f_dkey=B/data0.parquet, testdata/join/parquet/fact/f_dkey=B/data1.parquet], [testdata/join/parquet/fact/f_dkey=C/data0.parquet, testdata/join/parquet/fact/f_dkey=C/data1.parquet], [testdata/join/parquet/fact/f_dkey=D/data0.parquet]]}, projection=[timestamp, value, f_dkey], file_type=parquet, predicate=DynamicFilter [ empty ] + └────────────────────────────────────────────────── + "#; + + assert_eq!( + normalize(&distributed_plan).trim(), + target_plan.trim(), + "Plan mismatch!\nTarget:\n{}\nActual:\n{}", + target_plan, + distributed_plan + ); + + // ————————————————————————————————————————————————————————————— + // Ensure distributed and single-node datafusion results are equivalent. // ————————————————————————————————————————————————————————————— let ctx = SessionContext::new(); @@ -141,12 +171,14 @@ mod tests { let non_distributed_result = collect(physical_plan, state.task_ctx()).await?; pretty::print_batches(&non_distributed_result)?; - // ————————————————————————————————————————————————————————————— - // Ensure distributed and single-node results are equivalent. - // ————————————————————————————————————————————————————————————— - assert_eq!(distributed_result, non_distributed_result); - Ok(()) } + + fn normalize(s: &str) -> String { + let current_dir = std::env::current_dir().unwrap().display().to_string(); + let dir_without_slash = current_dir.trim_start_matches('/'); + s.replace(&format!("{}/", current_dir), "") + .replace(&format!("{}/", dir_without_slash), "") + } } From 543990be07af776c9ac3302d4dfcadc4212d9816 Mon Sep 17 00:00:00 2001 From: Justin O'Dwyer Date: Mon, 22 Dec 2025 21:57:31 -0500 Subject: [PATCH 08/14] Update based on Nga's comments. 
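Review changes: drop the ORDER BY and the single-node comparison run, pin
the expected rows with assert_batches_sorted_eq! (the macro sorts the
rendered rows before comparing, so it tolerates nondeterministic partition
order), declare a file_sort_order on the fact table, and update the target
plan, where CoalescePartitionsExec replaces SortPreservingMergeExec now
that the query no longer sorts. The plan check itself reduces to the
snippet below; the names are exactly those in the test, and normalize only
strips the absolute workspace prefix that DataSourceExec prints in
file_groups, keeping the assertion stable across checkouts:

    let distributed_plan = display_plan_ascii(physical_plan.as_ref(), false);
    // Compare path-normalized plan text against the pinned target plan.
    assert_eq!(normalize(&distributed_plan).trim(), target_plan.trim());
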
--- tests/join.rs | 128 ++++++++++++++++++++++++-------------------------- 1 file changed, 61 insertions(+), 67 deletions(-) diff --git a/tests/join.rs b/tests/join.rs index 98339d4c..d728b337 100644 --- a/tests/join.rs +++ b/tests/join.rs @@ -2,13 +2,21 @@ mod tests { use arrow::{datatypes::DataType, util::pretty}; use datafusion::{ - physical_plan::{collect, displayable}, - prelude::{ParquetReadOptions, SessionContext}, + assert_batches_sorted_eq, + physical_plan::collect, + prelude::{ParquetReadOptions, col}, }; use datafusion_distributed::{ DefaultSessionBuilder, display_plan_ascii, test_utils::localhost::start_localhost_context, }; + fn normalize(s: &str) -> String { + let current_dir = std::env::current_dir().unwrap().display().to_string(); + let dir_without_slash = current_dir.trim_start_matches('/'); + s.replace(&format!("{}/", current_dir), "") + .replace(&format!("{}/", dir_without_slash), "") + } + #[tokio::test] async fn test_join_hive() -> Result<(), Box> { let query = r#" @@ -22,7 +30,6 @@ mod tests { FROM dim d INNER JOIN fact f ON d.d_dkey = f.f_dkey WHERE d.service = 'log' - ORDER BY f.f_dkey, f.timestamp "#; // ————————————————————————————————————————————————————————————— @@ -81,7 +88,12 @@ mod tests { // Register hive-style partitioning for the fact table. let fact_options = ParquetReadOptions::default() - .table_partition_cols(vec![("f_dkey".to_string(), DataType::Utf8)]); + .table_partition_cols(vec![("f_dkey".to_string(), DataType::Utf8)]) + // TODO: Figure out why file sort order does not display in plan. + .file_sort_order(vec![vec![ + col("f_dkey").sort(true, true), + col("timestamp").sort(true, true), + ]]); distributed_ctx .register_parquet("fact", "testdata/join/parquet/fact", fact_options) .await?; @@ -93,27 +105,26 @@ mod tests { let distributed_plan = display_plan_ascii(physical_plan.as_ref(), false); println!("{}", distributed_plan); - let distributed_result = collect(physical_plan, state.task_ctx()).await?; - pretty::print_batches(&distributed_result)?; + let distributed_results = collect(physical_plan, state.task_ctx()).await?; + pretty::print_batches(&distributed_results)?; // ————————————————————————————————————————————————————————————— - // Ensure the distributed plan matches our target plan, utilizing - // hive-style partitioning. + // Ensure the distributed plan matches our target plan, registering + // hive-style partitioning and avoiding data-shuffling repartitions. 
// ————————————————————————————————————————————————————————————— let target_plan = r#"┌───── DistributedExec ── Tasks: t0:[p0] -│ SortPreservingMergeExec: [f_dkey@0 ASC NULLS LAST, timestamp@1 ASC NULLS LAST] +│ CoalescePartitionsExec │ [Stage 1] => NetworkCoalesceExec: output_partitions=4, input_tasks=2 └────────────────────────────────────────────────── ┌───── Stage 1 ── Tasks: t0:[p0..p1] t1:[p2..p3] - │ SortExec: expr=[f_dkey@0 ASC NULLS LAST, timestamp@1 ASC NULLS LAST], preserve_partitioning=[true] - │ ProjectionExec: expr=[f_dkey@5 as f_dkey, timestamp@3 as timestamp, value@4 as value, env@0 as env, service@1 as service, host@2 as host] - │ HashJoinExec: mode=Partitioned, join_type=Inner, on=[(d_dkey@3, f_dkey@2)], projection=[env@0, service@1, host@2, timestamp@4, value@5, f_dkey@6] - │ FilterExec: service@1 = log - │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] - │ DataSourceExec: file_groups={4 groups: [[testdata/join/parquet/dim/d_dkey=A/data0.parquet], [testdata/join/parquet/dim/d_dkey=B/data0.parquet], [testdata/join/parquet/dim/d_dkey=C/data0.parquet], [testdata/join/parquet/dim/d_dkey=D/data0.parquet]]}, projection=[env, service, host, d_dkey], file_type=parquet, predicate=service@1 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)] + │ ProjectionExec: expr=[f_dkey@5 as f_dkey, timestamp@3 as timestamp, value@4 as value, env@0 as env, service@1 as service, host@2 as host] + │ HashJoinExec: mode=Partitioned, join_type=Inner, on=[(d_dkey@3, f_dkey@2)], projection=[env@0, service@1, host@2, timestamp@4, value@5, f_dkey@6] + │ FilterExec: service@1 = log │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] - │ DataSourceExec: file_groups={4 groups: [[testdata/join/parquet/fact/f_dkey=A/data0.parquet], [testdata/join/parquet/fact/f_dkey=B/data2.parquet, testdata/join/parquet/fact/f_dkey=B/data0.parquet, testdata/join/parquet/fact/f_dkey=B/data1.parquet], [testdata/join/parquet/fact/f_dkey=C/data0.parquet, testdata/join/parquet/fact/f_dkey=C/data1.parquet], [testdata/join/parquet/fact/f_dkey=D/data0.parquet]]}, projection=[timestamp, value, f_dkey], file_type=parquet, predicate=DynamicFilter [ empty ] + │ DataSourceExec: file_groups={4 groups: [[testdata/join/parquet/dim/d_dkey=A/data0.parquet], [testdata/join/parquet/dim/d_dkey=B/data0.parquet], [testdata/join/parquet/dim/d_dkey=C/data0.parquet], [testdata/join/parquet/dim/d_dkey=D/data0.parquet]]}, projection=[env, service, host, d_dkey], file_type=parquet, predicate=service@1 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)] + │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] + │ DataSourceExec: file_groups={4 groups: [[testdata/join/parquet/fact/f_dkey=A/data0.parquet], [testdata/join/parquet/fact/f_dkey=B/data2.parquet, testdata/join/parquet/fact/f_dkey=B/data0.parquet, testdata/join/parquet/fact/f_dkey=B/data1.parquet], [testdata/join/parquet/fact/f_dkey=C/data0.parquet, testdata/join/parquet/fact/f_dkey=C/data1.parquet], [testdata/join/parquet/fact/f_dkey=D/data0.parquet]]}, projection=[timestamp, value, f_dkey], file_type=parquet, predicate=DynamicFilter [ empty ] └────────────────────────────────────────────────── "#; @@ -126,59 +137,42 @@ mod tests { ); // ————————————————————————————————————————————————————————————— - // Ensure distributed and single-node datafusion results are equivalent. 
+ // Ensure distributed results are correct. // ————————————————————————————————————————————————————————————— - let ctx = SessionContext::new(); - - ctx.state_ref() - .write() - .config_mut() - .options_mut() - .optimizer - .preserve_file_partitions = 1; - ctx.state_ref() - .write() - .config_mut() - .options_mut() - .optimizer - .subset_satisfaction_partition_threshold = 999; - ctx.state_ref() - .write() - .config_mut() - .options_mut() - .execution - .target_partitions = 4; - - // Register hive-style partitioning for the dim table. - let dim_options = ParquetReadOptions::default() - .table_partition_cols(vec![("d_dkey".to_string(), DataType::Utf8)]); - ctx.register_parquet("dim", "testdata/join/parquet/dim", dim_options) - .await?; - - // Register hive-style partitioning for the fact table. - let fact_options = ParquetReadOptions::default() - .table_partition_cols(vec![("f_dkey".to_string(), DataType::Utf8)]); - ctx.register_parquet("fact", "testdata/join/parquet/fact", fact_options) - .await?; - - let df = ctx.sql(query).await?; - let (state, logical_plan) = df.into_parts(); - let physical_plan = state.create_physical_plan(&logical_plan).await?; - println!("\n——————— PHYSICAL PLAN ———————\n"); - println!("{}", displayable(physical_plan.as_ref()).indent(true)); - - let non_distributed_result = collect(physical_plan, state.task_ctx()).await?; - pretty::print_batches(&non_distributed_result)?; - - assert_eq!(distributed_result, non_distributed_result); + let expected = vec![ + "+--------+---------------------+-------+------+---------+--------+", + "| f_dkey | timestamp | value | env | service | host |", + "+--------+---------------------+-------+------+---------+--------+", + "| A | 2023-01-01T09:00:00 | 95.5 | dev | log | host-y |", + "| A | 2023-01-01T09:00:10 | 102.3 | dev | log | host-y |", + "| A | 2023-01-01T09:00:20 | 98.7 | dev | log | host-y |", + "| A | 2023-01-01T09:12:20 | 105.1 | dev | log | host-y |", + "| A | 2023-01-01T09:12:30 | 100.0 | dev | log | host-y |", + "| A | 2023-01-01T09:12:40 | 150.0 | dev | log | host-y |", + "| A | 2023-01-01T09:12:50 | 120.8 | dev | log | host-y |", + "| B | 2023-01-01T11:00:00 | 72.8 | prod | log | host-x |", + "| B | 2023-01-01T11:00:10 | 79.4 | prod | log | host-x |", + "| B | 2023-01-01T11:00:20 | 76.1 | prod | log | host-x |", + "| B | 2023-01-01T11:00:30 | 83.7 | prod | log | host-x |", + "| B | 2023-01-01T11:12:30 | 77.2 | prod | log | host-x |", + "| B | 2023-01-01T09:00:00 | 75.2 | prod | log | host-x |", + "| B | 2023-01-01T09:00:10 | 82.4 | prod | log | host-x |", + "| B | 2023-01-01T09:00:20 | 78.9 | prod | log | host-x |", + "| B | 2023-01-01T09:00:30 | 85.6 | prod | log | host-x |", + "| B | 2023-01-01T09:12:30 | 80.0 | prod | log | host-x |", + "| B | 2023-01-01T09:12:40 | 120.0 | prod | log | host-x |", + "| B | 2023-01-01T09:12:50 | 92.3 | prod | log | host-x |", + "| B | 2023-01-01T10:00:00 | 88.5 | prod | log | host-x |", + "| B | 2023-01-01T10:00:10 | 91.2 | prod | log | host-x |", + "| B | 2023-01-01T10:00:20 | 87.3 | prod | log | host-x |", + "| B | 2023-01-01T10:00:30 | 94.1 | prod | log | host-x |", + "| B | 2023-01-01T10:12:30 | 89.5 | prod | log | host-x |", + "| B | 2023-01-01T10:12:40 | 95.8 | prod | log | host-x |", + "+--------+---------------------+-------+------+---------+--------+", + ]; + + assert_batches_sorted_eq!(expected, &distributed_results); Ok(()) } - - fn normalize(s: &str) -> String { - let current_dir = std::env::current_dir().unwrap().display().to_string(); - let dir_without_slash = 
current_dir.trim_start_matches('/'); - s.replace(&format!("{}/", current_dir), "") - .replace(&format!("{}/", dir_without_slash), "") - } } From 524f1b0740f380d94e1898e8ece6eac6ba0d5ec6 Mon Sep 17 00:00:00 2001 From: Justin O'Dwyer Date: Tue, 23 Dec 2025 11:34:40 -0500 Subject: [PATCH 09/14] Added second test. --- Cargo.lock | 68 ++++++++++---------- Cargo.toml | 6 +- tests/join.rs | 167 +++++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 191 insertions(+), 50 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d76789cd..7946332d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1218,7 +1218,7 @@ dependencies = [ [[package]] name = "datafusion" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "arrow", "arrow-schema", @@ -1268,7 +1268,7 @@ dependencies = [ [[package]] name = "datafusion-catalog" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "arrow", "async-trait", @@ -1292,7 +1292,7 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "arrow", "async-trait", @@ -1314,7 +1314,7 @@ dependencies = [ [[package]] name = "datafusion-common" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "ahash", "arrow", @@ -1336,7 +1336,7 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "futures", "log", @@ -1346,7 +1346,7 @@ dependencies = [ [[package]] name = "datafusion-datasource" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" 
dependencies = [ "arrow", "async-trait", @@ -1374,7 +1374,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-arrow" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "arrow", "arrow-ipc", @@ -1397,7 +1397,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-csv" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "arrow", "async-trait", @@ -1419,7 +1419,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-json" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "arrow", "async-trait", @@ -1440,7 +1440,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-parquet" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "arrow", "async-trait", @@ -1533,16 +1533,15 @@ dependencies = [ [[package]] name = "datafusion-doc" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" [[package]] name = "datafusion-execution" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "arrow", "async-trait", - "chrono", "dashmap", "datafusion-common", "datafusion-expr", @@ -1558,7 +1557,7 @@ dependencies = [ [[package]] name = "datafusion-expr" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "arrow", "async-trait", @@ 
-1579,7 +1578,7 @@ dependencies = [ [[package]] name = "datafusion-expr-common" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "arrow", "datafusion-common", @@ -1591,7 +1590,7 @@ dependencies = [ [[package]] name = "datafusion-functions" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "arrow", "arrow-buffer", @@ -1616,7 +1615,7 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "ahash", "arrow", @@ -1636,7 +1635,7 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "ahash", "arrow", @@ -1648,7 +1647,7 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "arrow", "arrow-ord", @@ -1670,7 +1669,7 @@ dependencies = [ [[package]] name = "datafusion-functions-table" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "arrow", "async-trait", @@ -1685,7 +1684,7 @@ dependencies = [ [[package]] name = "datafusion-functions-window" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "arrow", "datafusion-common", @@ 
-1702,7 +1701,7 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1711,7 +1710,7 @@ dependencies = [ [[package]] name = "datafusion-macros" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "datafusion-doc", "quote", @@ -1721,7 +1720,7 @@ dependencies = [ [[package]] name = "datafusion-optimizer" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "arrow", "chrono", @@ -1739,7 +1738,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "ahash", "arrow", @@ -1761,7 +1760,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-adapter" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "arrow", "datafusion-common", @@ -1775,7 +1774,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "ahash", "arrow", @@ -1788,7 +1787,7 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ 
"arrow", "datafusion-common", @@ -1805,13 +1804,14 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "ahash", "arrow", "arrow-ord", "arrow-schema", "async-trait", + "chrono", "datafusion-common", "datafusion-common-runtime", "datafusion-execution", @@ -1835,7 +1835,7 @@ dependencies = [ [[package]] name = "datafusion-proto" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "arrow", "chrono", @@ -1861,7 +1861,7 @@ dependencies = [ [[package]] name = "datafusion-proto-common" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "arrow", "datafusion-common", @@ -1871,7 +1871,7 @@ dependencies = [ [[package]] name = "datafusion-pruning" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "arrow", "datafusion-common", @@ -1887,7 +1887,7 @@ dependencies = [ [[package]] name = "datafusion-session" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "async-trait", "datafusion-common", @@ -1900,7 +1900,7 @@ dependencies = [ [[package]] name = "datafusion-sql" version = "51.0.0" -source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_partitioning_satisfies_subset#24886c6ed1b73a3e5fe2e7c1a221028c1fccec30" +source = "git+https://github.com/gene-bordegaray/datafusion.git?branch=gene.bordegaray%2F2025%2F12%2Fhash_superset_satisfies_partitioning#510b758e89bd04db2f0ed649e733c01a7e6a6f8b" dependencies = [ "arrow", "bigdecimal", diff --git a/Cargo.toml b/Cargo.toml index c631e700..613de8c0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,8 +7,10 @@ members = ["benchmarks"] # This PR includes: # - Hash subset satisfaction logic (eliminates unnecessary repartitions) # - File partitioning preservation (from PR #19124) -datafusion = { git = "https://github.com/gene-bordegaray/datafusion.git", branch = 
"gene.bordegaray/2025/12/hash_partitioning_satisfies_subset", default-features = false } -datafusion-proto = { git = "https://github.com/gene-bordegaray/datafusion.git", branch = "gene.bordegaray/2025/12/hash_partitioning_satisfies_subset" } +# datafusion = { git = "https://github.com/gene-bordegaray/datafusion.git", branch = "gene.bordegaray/2025/12/hash_partitioning_satisfies_subset", default-features = false } +# datafusion-proto = { git = "https://github.com/gene-bordegaray/datafusion.git", branch = "gene.bordegaray/2025/12/hash_partitioning_satisfies_subset" } + datafusion = { git = "https://github.com/gene-bordegaray/datafusion.git", branch = "gene.bordegaray/2025/12/hash_superset_satisfies_partitioning", default-features = false } + datafusion-proto = { git = "https://github.com/gene-bordegaray/datafusion.git", branch = "gene.bordegaray/2025/12/hash_superset_satisfies_partitioning" } [package] name = "datafusion-distributed" diff --git a/tests/join.rs b/tests/join.rs index d728b337..fd7b7ab8 100644 --- a/tests/join.rs +++ b/tests/join.rs @@ -15,6 +15,7 @@ mod tests { let dir_without_slash = current_dir.trim_start_matches('/'); s.replace(&format!("{}/", current_dir), "") .replace(&format!("{}/", dir_without_slash), "") + .replace(" ", "") } #[tokio::test] @@ -47,14 +48,6 @@ mod tests { .options_mut() .optimizer .preserve_file_partitions = 1; - // Set a high threshold to encourage subset satisfaction. - distributed_ctx - .state_ref() - .write() - .config_mut() - .options_mut() - .optimizer - .subset_satisfaction_partition_threshold = 999; // Read data from 4 hive-style partitions. distributed_ctx .state_ref() @@ -101,9 +94,8 @@ mod tests { let df = distributed_ctx.sql(query).await?; let (state, logical_plan) = df.into_parts(); let physical_plan = state.create_physical_plan(&logical_plan).await?; - println!("\n——————— DISTRIBUTED PLAN ———————\n"); let distributed_plan = display_plan_ascii(physical_plan.as_ref(), false); - println!("{}", distributed_plan); + println!("\n——————— DISTRIBUTED PLAN ———————\n\n{}", distributed_plan); let distributed_results = collect(physical_plan, state.task_ctx()).await?; pretty::print_batches(&distributed_results)?; @@ -128,12 +120,12 @@ mod tests { └────────────────────────────────────────────────── "#; + let normalized_distributed = normalize(&distributed_plan); + let normalized_target = normalize(&target_plan); assert_eq!( - normalize(&distributed_plan).trim(), - target_plan.trim(), + normalized_distributed, normalized_target, "Plan mismatch!\nTarget:\n{}\nActual:\n{}", - target_plan, - distributed_plan + normalized_target, normalized_distributed ); // ————————————————————————————————————————————————————————————— @@ -175,4 +167,151 @@ mod tests { assert_batches_sorted_eq!(expected, &distributed_results); Ok(()) } + + #[tokio::test] + async fn test_join_agg_hive() -> Result<(), Box> { + let query = r#" + SELECT f_dkey, + date_bin(INTERVAL '30 seconds', timestamp) AS time_bin, + env, + MAX(value) AS max_bin_value + FROM + ( + SELECT + f.f_dkey, + d.env, + d.service, + d.host, + f.timestamp, + f.value + FROM dim d + INNER JOIN fact f ON d.d_dkey = f.f_dkey + WHERE service = 'log' + ) AS j + GROUP BY f_dkey, time_bin, env + "#; + + // ————————————————————————————————————————————————————————————— + // Execute the query using distributed datafusion, 2 workers, + // and hive-style partitioned data. 
+ // ————————————————————————————————————————————————————————————— + + let (distributed_ctx, _guard) = start_localhost_context(2, DefaultSessionBuilder).await; + + // Preserve hive-style file partitions. + distributed_ctx + .state_ref() + .write() + .config_mut() + .options_mut() + .optimizer + .preserve_file_partitions = 1; + // Read data from 4 hive-style partitions. + distributed_ctx + .state_ref() + .write() + .config_mut() + .options_mut() + .execution + .target_partitions = 4; + // Ensure that we use a partitioned hash join. + distributed_ctx + .state_ref() + .write() + .config_mut() + .options_mut() + .optimizer + .hash_join_single_partition_threshold = 0; + distributed_ctx + .state_ref() + .write() + .config_mut() + .options_mut() + .optimizer + .hash_join_single_partition_threshold_rows = 0; + + // Register hive-style partitioning for the dim table. + let dim_options = ParquetReadOptions::default() + .table_partition_cols(vec![("d_dkey".to_string(), DataType::Utf8)]); + distributed_ctx + .register_parquet("dim", "testdata/join/parquet/dim", dim_options) + .await?; + + // Register hive-style partitioning for the fact table. + let fact_options = ParquetReadOptions::default() + .table_partition_cols(vec![("f_dkey".to_string(), DataType::Utf8)]) + // TODO: Figure out why file sort order does not display in plan. + .file_sort_order(vec![vec![ + col("f_dkey").sort(true, true), + col("timestamp").sort(true, true), + ]]); + distributed_ctx + .register_parquet("fact", "testdata/join/parquet/fact", fact_options) + .await?; + + let df = distributed_ctx.sql(query).await?; + let (state, logical_plan) = df.into_parts(); + let physical_plan = state.create_physical_plan(&logical_plan).await?; + let distributed_plan = display_plan_ascii(physical_plan.as_ref(), false); + println!("\n——————— DISTRIBUTED PLAN ———————\n\n{}", distributed_plan); + + let distributed_results = collect(physical_plan, state.task_ctx()).await?; + pretty::print_batches(&distributed_results)?; + + // ————————————————————————————————————————————————————————————— + // Ensure the distributed plan matches our target plan, registering + // hive-style partitioning and avoiding data-shuffling repartitions.
+ // ————————————————————————————————————————————————————————————— + + let target_plan = r#"┌───── DistributedExec ── Tasks: t0:[p0] +│ CoalescePartitionsExec +│ [Stage 1] => NetworkCoalesceExec: output_partitions=4, input_tasks=2 +└────────────────────────────────────────────────── + ┌───── Stage 1 ── Tasks: t0:[p0..p1] t1:[p2..p3] + │ ProjectionExec: expr=[f_dkey@0 as f_dkey, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp)@1 as time_bin, env@2 as env, max(j.value)@3 as max_bin_value] + │ AggregateExec: mode=SinglePartitioned, gby=[f_dkey@0 as f_dkey, date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }, timestamp@2) as date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp), env@1 as env], aggr=[max(j.value)], ordering_mode=PartiallySorted([0, 1]) + │ ProjectionExec: expr=[f_dkey@3 as f_dkey, env@0 as env, timestamp@1 as timestamp, value@2 as value] + │ HashJoinExec: mode=Partitioned, join_type=Inner, on=[(d_dkey@1, f_dkey@2)], projection=[env@0, timestamp@2, value@3, f_dkey@4] + │ FilterExec: service@1 = log, projection=[env@0, d_dkey@2] + │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] + │ DataSourceExec: file_groups={4 groups: [[testdata/join/parquet/dim/d_dkey=A/data0.parquet], [testdata/join/parquet/dim/d_dkey=B/data0.parquet], [testdata/join/parquet/dim/d_dkey=C/data0.parquet], [testdata/join/parquet/dim/d_dkey=D/data0.parquet]]}, projection=[env, service, d_dkey], file_type=parquet, predicate=service@1 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)] + │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] + │ DataSourceExec: file_groups={4 groups: [[testdata/join/parquet/fact/f_dkey=A/data0.parquet], [testdata/join/parquet/fact/f_dkey=B/data2.parquet, testdata/join/parquet/fact/f_dkey=B/data0.parquet, testdata/join/parquet/fact/f_dkey=B/data1.parquet], [testdata/join/parquet/fact/f_dkey=C/data0.parquet, testdata/join/parquet/fact/f_dkey=C/data1.parquet], [testdata/join/parquet/fact/f_dkey=D/data0.parquet]]}, projection=[timestamp, value, f_dkey], file_type=parquet, predicate=DynamicFilter [ empty ] + └────────────────────────────────────────────────── + "#; + + let normalized_distributed = normalize(&distributed_plan); + let normalized_target = normalize(&target_plan); + assert_eq!( + normalized_distributed, normalized_target, + "Plan mismatch!\nTarget:\n{}\nActual:\n{}", + normalized_target, normalized_distributed + ); + + // ————————————————————————————————————————————————————————————— + // Ensure distributed results are correct. 
+ // ————————————————————————————————————————————————————————————— + + let expected = vec![ + "+--------+---------------------+------+---------------+", + "| f_dkey | time_bin | env | max_bin_value |", + "+--------+---------------------+------+---------------+", + "| A | 2023-01-01T09:00:00 | dev | 102.3 |", + "| A | 2023-01-01T09:12:00 | dev | 105.1 |", + "| A | 2023-01-01T09:12:30 | dev | 150.0 |", + "| B | 2023-01-01T11:00:00 | prod | 79.4 |", + "| B | 2023-01-01T11:00:30 | prod | 83.7 |", + "| B | 2023-01-01T11:12:30 | prod | 77.2 |", + "| B | 2023-01-01T09:00:00 | prod | 82.4 |", + "| B | 2023-01-01T09:00:30 | prod | 85.6 |", + "| B | 2023-01-01T09:12:30 | prod | 120.0 |", + "| B | 2023-01-01T10:00:00 | prod | 91.2 |", + "| B | 2023-01-01T10:00:30 | prod | 94.1 |", + "| B | 2023-01-01T10:12:30 | prod | 95.8 |", + "+--------+---------------------+------+---------------+", + ]; + + assert_batches_sorted_eq!(expected, &distributed_results); + Ok(()) + } } From 2ded3217add3d0a0162ccbc419c0e441dd116e7a Mon Sep 17 00:00:00 2001 From: Justin O'Dwyer Date: Tue, 23 Dec 2025 11:44:12 -0500 Subject: [PATCH 10/14] Added third test. --- tests/join.rs | 159 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 159 insertions(+) diff --git a/tests/join.rs b/tests/join.rs index fd7b7ab8..c26b8d41 100644 --- a/tests/join.rs +++ b/tests/join.rs @@ -314,4 +314,163 @@ mod tests { assert_batches_sorted_eq!(expected, &distributed_results); Ok(()) } + + #[tokio::test] + async fn test_join_time_space_agg_hive() -> Result<(), Box<dyn std::error::Error>> { + let query = r#" + SELECT env, time_bin, AVG(max_bin_value) AS avg_max_value + FROM + ( + SELECT f_dkey, + date_bin(INTERVAL '30 seconds', timestamp) AS time_bin, + env, + MAX(value) AS max_bin_value + FROM + ( + SELECT + f.f_dkey, + d.env, + d.service, + d.host, + f.timestamp, + f.value + FROM dim d + INNER JOIN fact f ON d.d_dkey = f.f_dkey + WHERE service = 'log' + ) AS j + GROUP BY f_dkey, time_bin, env + ) AS a + GROUP BY env, time_bin + ORDER BY env, time_bin + "#; + + // ————————————————————————————————————————————————————————————— + // Execute the query using distributed datafusion, 2 workers, + // and hive-style partitioned data. + // ————————————————————————————————————————————————————————————— + + let (distributed_ctx, _guard) = start_localhost_context(2, DefaultSessionBuilder).await; + + // Preserve hive-style file partitions. + distributed_ctx + .state_ref() + .write() + .config_mut() + .options_mut() + .optimizer + .preserve_file_partitions = 1; + // Read data from 4 hive-style partitions. + distributed_ctx + .state_ref() + .write() + .config_mut() + .options_mut() + .execution + .target_partitions = 4; + // Ensure that we use a partitioned hash join. + distributed_ctx + .state_ref() + .write() + .config_mut() + .options_mut() + .optimizer + .hash_join_single_partition_threshold = 0; + distributed_ctx + .state_ref() + .write() + .config_mut() + .options_mut() + .optimizer + .hash_join_single_partition_threshold_rows = 0; + + // Register hive-style partitioning for the dim table. + let dim_options = ParquetReadOptions::default() + .table_partition_cols(vec![("d_dkey".to_string(), DataType::Utf8)]); + distributed_ctx + .register_parquet("dim", "testdata/join/parquet/dim", dim_options) + .await?; + + // Register hive-style partitioning for the fact table. + let fact_options = ParquetReadOptions::default() + .table_partition_cols(vec![("f_dkey".to_string(), DataType::Utf8)]) + // TODO: Figure out why file sort order does not display in plan.
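+ // file_sort_order declares the order the fact files already have on
+ // disk; DataFusion takes this declaration on trust rather than
+ // re-verifying it at scan time, so it must match how the parquet
+ // files were actually written.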
+ .file_sort_order(vec![vec![ + col("f_dkey").sort(true, true), + col("timestamp").sort(true, true), + ]]); + distributed_ctx + .register_parquet("fact", "testdata/join/parquet/fact", fact_options) + .await?; + + let df = distributed_ctx.sql(query).await?; + let (state, logical_plan) = df.into_parts(); + let physical_plan = state.create_physical_plan(&logical_plan).await?; + let distributed_plan = display_plan_ascii(physical_plan.as_ref(), false); + println!("\n——————— DISTRIBUTED PLAN ———————\n\n{}", distributed_plan); + + let distributed_results = collect(physical_plan, state.task_ctx()).await?; + pretty::print_batches(&distributed_results)?; + + // ————————————————————————————————————————————————————————————— + // Ensure the distributed plan matches our target plan, registering + // hive-style partitioning and avoiding data-shuffling repartitions. + // ————————————————————————————————————————————————————————————— + + let target_plan = r#"┌───── DistributedExec ── Tasks: t0:[p0] +│ SortPreservingMergeExec: [env@0 ASC NULLS LAST, time_bin@1 ASC NULLS LAST] +│ SortExec: expr=[env@0 ASC NULLS LAST, time_bin@1 ASC NULLS LAST], preserve_partitioning=[true] +│ ProjectionExec: expr=[env@0 as env, time_bin@1 as time_bin, avg(a.max_bin_value)@2 as avg_max_value] +│ AggregateExec: mode=FinalPartitioned, gby=[env@0 as env, time_bin@1 as time_bin], aggr=[avg(a.max_bin_value)] +│ [Stage 1] => NetworkShuffleExec: output_partitions=4, input_tasks=2 +└────────────────────────────────────────────────── + ┌───── Stage 1 ── Tasks: t0:[p0..p3] t1:[p0..p3] + │ CoalesceBatchesExec: target_batch_size=8192 + │ RepartitionExec: partitioning=Hash([env@0, time_bin@1], 4), input_partitions=2 + │ AggregateExec: mode=Partial, gby=[env@1 as env, time_bin@0 as time_bin], aggr=[avg(a.max_bin_value)] + │ ProjectionExec: expr=[date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp)@1 as time_bin, env@2 as env, max(j.value)@3 as max_bin_value] + │ AggregateExec: mode=SinglePartitioned, gby=[f_dkey@0 as f_dkey, date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }, timestamp@2) as date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp), env@1 as env], aggr=[max(j.value)], ordering_mode=PartiallySorted([0, 1]) + │ ProjectionExec: expr=[f_dkey@3 as f_dkey, env@0 as env, timestamp@1 as timestamp, value@2 as value] + │ HashJoinExec: mode=Partitioned, join_type=Inner, on=[(d_dkey@1, f_dkey@2)], projection=[env@0, timestamp@2, value@3, f_dkey@4] + │ FilterExec: service@1 = log, projection=[env@0, d_dkey@2] + │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] + │ DataSourceExec: file_groups={4 groups: [[testdata/join/parquet/dim/d_dkey=A/data0.parquet], [testdata/join/parquet/dim/d_dkey=B/data0.parquet], [testdata/join/parquet/dim/d_dkey=C/data0.parquet], [testdata/join/parquet/dim/d_dkey=D/data0.parquet]]}, projection=[env, service, d_dkey], file_type=parquet, predicate=service@1 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)] + │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] + │ DataSourceExec: file_groups={4 groups: [[testdata/join/parquet/fact/f_dkey=A/data0.parquet], [testdata/join/parquet/fact/f_dkey=B/data2.parquet, testdata/join/parquet/fact/f_dkey=B/data0.parquet, testdata/join/parquet/fact/f_dkey=B/data1.parquet],
[testdata/join/parquet/fact/f_dkey=C/data0.parquet, testdata/join/parquet/fact/f_dkey=C/data1.parquet], [testdata/join/parquet/fact/f_dkey=D/data0.parquet]]}, projection=[timestamp, value, f_dkey], file_type=parquet, predicate=DynamicFilter [ empty ] + └────────────────────────────────────────────────── + "#; + + let normalized_distributed = normalize(&distributed_plan); + let normalized_target = normalize(&target_plan); + assert_eq!( + normalized_distributed, normalized_target, + "Plan mismatch!\nTarget:\n{}\nActual:\n{}", + normalized_target, normalized_distributed + ); + + // ————————————————————————————————————————————————————————————— + // Ensure distributed results are correct. + // ————————————————————————————————————————————————————————————— + + let expected = vec![ + "+------+---------------------+---------------+", + "| env | time_bin | avg_max_value |", + "+------+---------------------+---------------+", + "| dev | 2023-01-01T09:00:00 | 102.3 |", + "| dev | 2023-01-01T09:12:00 | 105.1 |", + "| dev | 2023-01-01T09:12:30 | 150.0 |", + "| prod | 2023-01-01T09:00:00 | 82.4 |", + "| prod | 2023-01-01T09:00:30 | 85.6 |", + "| prod | 2023-01-01T09:12:30 | 120.0 |", + "| prod | 2023-01-01T10:00:00 | 91.2 |", + "| prod | 2023-01-01T10:00:30 | 94.1 |", + "| prod | 2023-01-01T10:12:30 | 95.8 |", + "| prod | 2023-01-01T11:00:00 | 79.4 |", + "| prod | 2023-01-01T11:00:30 | 83.7 |", + "| prod | 2023-01-01T11:12:30 | 77.2 |", + "+------+---------------------+---------------+", + ]; + + assert_batches_sorted_eq!(expected, &distributed_results); + Ok(()) + } } From b157d0ee943c21779569b7a8a4c442682027c66f Mon Sep 17 00:00:00 2001 From: Justin O'Dwyer Date: Tue, 23 Dec 2025 13:31:38 -0500 Subject: [PATCH 11/14] Refactor join tests into shared helpers. --- tests/join.rs | 265 +++++++++++++++----------------------- 1 file changed, 76 insertions(+), 189 deletions(-) diff --git a/tests/join.rs b/tests/join.rs index c26b8d41..e38a5c8d 100644 --- a/tests/join.rs +++ b/tests/join.rs @@ -1,82 +1,51 @@ #[cfg(test)] mod tests { - use arrow::{datatypes::DataType, util::pretty}; + use arrow::{array::RecordBatch, datatypes::DataType, util::pretty}; use datafusion::{ assert_batches_sorted_eq, + error::Result, physical_plan::collect, - prelude::{ParquetReadOptions, col}, + prelude::{ParquetReadOptions, SessionContext, col}, }; use datafusion_distributed::{ DefaultSessionBuilder, display_plan_ascii, test_utils::localhost::start_localhost_context, }; - fn normalize(s: &str) -> String { - let current_dir = std::env::current_dir().unwrap().display().to_string(); - let dir_without_slash = current_dir.trim_start_matches('/'); - s.replace(&format!("{}/", current_dir), "") - .replace(&format!("{}/", dir_without_slash), "") - .replace(" ", "") - } - - #[tokio::test] - async fn test_join_hive() -> Result<(), Box<dyn std::error::Error>> { - let query = r#" - SELECT - f.f_dkey, - f.timestamp, - f.value, - d.env, - d.service, - d.host - FROM dim d - INNER JOIN fact f ON d.d_dkey = f.f_dkey - WHERE d.service = 'log' - "#; - - // ————————————————————————————————————————————————————————————— - // Execute the query using distributed datafusion, 2 workers, - // and hive-style partitioned data. - // ————————————————————————————————————————————————————————————— - - let (distributed_ctx, _guard) = start_localhost_context(2, DefaultSessionBuilder).await; - + fn set_configs(ctx: &SessionContext) { // Preserve hive-style file partitions.
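+ // preserve_file_partitions comes from the patched DataFusion branch
+ // pinned in Cargo.toml (file partitioning preservation); the target
+ // plans below rely on it keeping one file group per hive partition
+ // (d_dkey=A..D) so the join can run without a repartition.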
- distributed_ctx - .state_ref() + ctx.state_ref() .write() .config_mut() .options_mut() .optimizer .preserve_file_partitions = 1; // Read data from 4 hive-style partitions. - distributed_ctx - .state_ref() + ctx.state_ref() .write() .config_mut() .options_mut() .execution .target_partitions = 4; // Ensure that we use a partitioned hash join. - distributed_ctx - .state_ref() + ctx.state_ref() .write() .config_mut() .options_mut() .optimizer .hash_join_single_partition_threshold = 0; - distributed_ctx - .state_ref() + ctx.state_ref() .write() .config_mut() .options_mut() .optimizer .hash_join_single_partition_threshold_rows = 0; + } + async fn register_tables(ctx: &SessionContext) -> Result<()> { // Register hive-style partitioning for the dim table. let dim_options = ParquetReadOptions::default() .table_partition_cols(vec![("d_dkey".to_string(), DataType::Utf8)]); - distributed_ctx - .register_parquet("dim", "testdata/join/parquet/dim", dim_options) + ctx.register_parquet("dim", "testdata/join/parquet/dim", dim_options) .await?; // Register hive-style partitioning for the fact table. @@ -87,11 +56,16 @@ mod tests { col("f_dkey").sort(true, true), col("timestamp").sort(true, true), ]]); - distributed_ctx - .register_parquet("fact", "testdata/join/parquet/fact", fact_options) + ctx.register_parquet("fact", "testdata/join/parquet/fact", fact_options) .await?; + Ok(()) + } - let df = distributed_ctx.sql(query).await?; + async fn execute_query( + ctx: &SessionContext, + query: &'static str, + ) -> Result<(String, Vec<RecordBatch>)> { + let df = ctx.sql(query).await?; let (state, logical_plan) = df.into_parts(); let physical_plan = state.create_physical_plan(&logical_plan).await?; let distributed_plan = display_plan_ascii(physical_plan.as_ref(), false); @@ -99,6 +73,52 @@ mod tests { let distributed_results = collect(physical_plan, state.task_ctx()).await?; pretty::print_batches(&distributed_results)?; + Ok((distributed_plan, distributed_results)) + } + + fn validate_plan(plan: String, target_plan: &'static str) { + let normalized_distributed = normalize(&plan); + let normalized_target = normalize(&target_plan); + assert_eq!( + normalized_distributed, normalized_target, + "Plan mismatch!\nTarget:\n{}\nActual:\n{}", + normalized_target, normalized_distributed + ); + } + + fn normalize(s: &str) -> String { + let current_dir = std::env::current_dir().unwrap().display().to_string(); + let dir_without_slash = current_dir.trim_start_matches('/'); + s.replace(&format!("{}/", current_dir), "") + .replace(&format!("{}/", dir_without_slash), "") + .replace(" ", "") + } + + #[tokio::test] + async fn test_join_hive() -> Result<(), Box<dyn std::error::Error>> { + let query = r#" + SELECT + f.f_dkey, + f.timestamp, + f.value, + d.env, + d.service, + d.host + FROM dim d + INNER JOIN fact f ON d.d_dkey = f.f_dkey + WHERE d.service = 'log' + "#; + + // ————————————————————————————————————————————————————————————— + // Execute the query using distributed datafusion, 2 workers, + // and hive-style partitioned data.
+ // ————————————————————————————————————————————————————————————— + + let (distributed_ctx, _guard) = start_localhost_context(2, DefaultSessionBuilder).await; + set_configs(&distributed_ctx); + register_tables(&distributed_ctx).await?; + let (distributed_plan, distributed_results) = + execute_query(&distributed_ctx, query).await?; // ————————————————————————————————————————————————————————————— // Ensure the distributed plan matches our target plan, registering @@ -119,14 +139,7 @@ mod tests { │ DataSourceExec: file_groups={4 groups: [[testdata/join/parquet/fact/f_dkey=A/data0.parquet], [testdata/join/parquet/fact/f_dkey=B/data2.parquet, testdata/join/parquet/fact/f_dkey=B/data0.parquet, testdata/join/parquet/fact/f_dkey=B/data1.parquet], [testdata/join/parquet/fact/f_dkey=C/data0.parquet, testdata/join/parquet/fact/f_dkey=C/data1.parquet], [testdata/join/parquet/fact/f_dkey=D/data0.parquet]]}, projection=[timestamp, value, f_dkey], file_type=parquet, predicate=DynamicFilter [ empty ] └────────────────────────────────────────────────── "#; - - let normalized_distributed = normalize(&distributed_plan); - let normalized_target = normalize(&target_plan); - assert_eq!( - normalized_distributed, normalized_target, - "Plan mismatch!\nTarget:\n{}\nActual:\n{}", - normalized_target, normalized_distributed - ); + validate_plan(distributed_plan, target_plan); // ————————————————————————————————————————————————————————————— // Ensure distributed results are correct. @@ -197,66 +210,10 @@ mod tests { // ————————————————————————————————————————————————————————————— let (distributed_ctx, _guard) = start_localhost_context(2, DefaultSessionBuilder).await; - - // Preserve hive-style file partitions. - distributed_ctx - .state_ref() - .write() - .config_mut() - .options_mut() - .optimizer - .preserve_file_partitions = 1; - // Read data from 4 hive-style partitions. - distributed_ctx - .state_ref() - .write() - .config_mut() - .options_mut() - .execution - .target_partitions = 4; - // Ensure that we use a partitioned hash join. - distributed_ctx - .state_ref() - .write() - .config_mut() - .options_mut() - .optimizer - .hash_join_single_partition_threshold = 0; - distributed_ctx - .state_ref() - .write() - .config_mut() - .options_mut() - .optimizer - .hash_join_single_partition_threshold_rows = 0; - - // Register hive-style partitioning for the dim table. - let dim_options = ParquetReadOptions::default() - .table_partition_cols(vec![("d_dkey".to_string(), DataType::Utf8)]); - distributed_ctx - .register_parquet("dim", "testdata/join/parquet/dim", dim_options) - .await?; - - // Register hive-style partitioning for the fact table. - let fact_options = ParquetReadOptions::default() - .table_partition_cols(vec![("f_dkey".to_string(), DataType::Utf8)]) - // TODO: Figure out why file sort order does not display in plan. 
- .file_sort_order(vec![vec![ - col("f_dkey").sort(true, true), - col("timestamp").sort(true, true), - ]]); - distributed_ctx - .register_parquet("fact", "testdata/join/parquet/fact", fact_options) - .await?; - - let df = distributed_ctx.sql(query).await?; - let (state, logical_plan) = df.into_parts(); - let physical_plan = state.create_physical_plan(&logical_plan).await?; - let distributed_plan = display_plan_ascii(physical_plan.as_ref(), false); - println!("\n——————— DISTRIBUTED PLAN ———————\n\n{}", distributed_plan); - - let distributed_results = collect(physical_plan, state.task_ctx()).await?; - pretty::print_batches(&distributed_results)?; + set_configs(&distributed_ctx); + register_tables(&distributed_ctx).await?; + let (distributed_plan, distributed_results) = + execute_query(&distributed_ctx, query).await?; // ————————————————————————————————————————————————————————————— // Ensure the distributed plan matches our target plan, registering @@ -279,14 +236,7 @@ mod tests { │ DataSourceExec: file_groups={4 groups: [[testdata/join/parquet/fact/f_dkey=A/data0.parquet], [testdata/join/parquet/fact/f_dkey=B/data2.parquet, testdata/join/parquet/fact/f_dkey=B/data0.parquet, testdata/join/parquet/fact/f_dkey=B/data1.parquet], [testdata/join/parquet/fact/f_dkey=C/data0.parquet, testdata/join/parquet/fact/f_dkey=C/data1.parquet], [testdata/join/parquet/fact/f_dkey=D/data0.parquet]]}, projection=[timestamp, value, f_dkey], file_type=parquet, predicate=DynamicFilter [ empty ] └────────────────────────────────────────────────── "#; - - let normalized_distributed = normalize(&distributed_plan); - let normalized_target = normalize(&target_plan); - assert_eq!( - normalized_distributed, normalized_target, - "Plan mismatch!\nTarget:\n{}\nActual:\n{}", - normalized_target, normalized_distributed - ); + validate_plan(distributed_plan, target_plan); // ————————————————————————————————————————————————————————————— // Ensure distributed results are correct. @@ -350,66 +300,10 @@ mod tests { // ————————————————————————————————————————————————————————————— let (distributed_ctx, _guard) = start_localhost_context(2, DefaultSessionBuilder).await; - - // Preserve hive-style file partitions. - distributed_ctx - .state_ref() - .write() - .config_mut() - .options_mut() - .optimizer - .preserve_file_partitions = 1; - // Read data from 4 hive-style partitions. - distributed_ctx - .state_ref() - .write() - .config_mut() - .options_mut() - .execution - .target_partitions = 4; - // Ensure that we use a partitioned hash join. - distributed_ctx - .state_ref() - .write() - .config_mut() - .options_mut() - .optimizer - .hash_join_single_partition_threshold = 0; - distributed_ctx - .state_ref() - .write() - .config_mut() - .options_mut() - .optimizer - .hash_join_single_partition_threshold_rows = 0; - - // Register hive-style partitioning for the dim table. - let dim_options = ParquetReadOptions::default() - .table_partition_cols(vec![("d_dkey".to_string(), DataType::Utf8)]); - distributed_ctx - .register_parquet("dim", "testdata/join/parquet/dim", dim_options) - .await?; - - // Register hive-style partitioning for the fact table. - let fact_options = ParquetReadOptions::default() - .table_partition_cols(vec![("f_dkey".to_string(), DataType::Utf8)]) - // TODO: Figure out why file sort order does not display in plan.
- .file_sort_order(vec![vec![ - col("f_dkey").sort(true, true), - col("timestamp").sort(true, true), - ]]); - distributed_ctx - .register_parquet("fact", "testdata/join/parquet/fact", fact_options) - .await?; - - let df = distributed_ctx.sql(query).await?; - let (state, logical_plan) = df.into_parts(); - let physical_plan = state.create_physical_plan(&logical_plan).await?; - let distributed_plan = display_plan_ascii(physical_plan.as_ref(), false); - println!("\n——————— DISTRIBUTED PLAN ———————\n\n{}", distributed_plan); - - let distributed_results = collect(physical_plan, state.task_ctx()).await?; - pretty::print_batches(&distributed_results)?; + set_configs(&distributed_ctx); + register_tables(&distributed_ctx).await?; + let (distributed_plan, distributed_results) = + execute_query(&distributed_ctx, query).await?; // ————————————————————————————————————————————————————————————— // Ensure the distributed plan matches our target plan, registering @@ -438,14 +332,7 @@ mod tests { │ DataSourceExec: file_groups={4 groups: [[testdata/join/parquet/fact/f_dkey=A/data0.parquet], [testdata/join/parquet/fact/f_dkey=B/data2.parquet, testdata/join/parquet/fact/f_dkey=B/data0.parquet, testdata/join/parquet/fact/f_dkey=B/data1.parquet], [testdata/join/parquet/fact/f_dkey=C/data0.parquet, testdata/join/parquet/fact/f_dkey=C/data1.parquet], [testdata/join/parquet/fact/f_dkey=D/data0.parquet]]}, projection=[timestamp, value, f_dkey], file_type=parquet, predicate=DynamicFilter [ empty ] └────────────────────────────────────────────────── "#; - - let normalized_distributed = normalize(&distributed_plan); - let normalized_target = normalize(&target_plan); - assert_eq!( - normalized_distributed, normalized_target, - "Plan mismatch!\nTarget:\n{}\nActual:\n{}", - normalized_target, normalized_distributed - ); + validate_plan(distributed_plan, target_plan); // ————————————————————————————————————————————————————————————— // Ensure distributed results are correct. From 23da3d7dbfe9582101b1c13870ce9d5ef0f7d500 Mon Sep 17 00:00:00 2001 From: Justin O'Dwyer Date: Mon, 29 Dec 2025 14:41:50 -0500 Subject: [PATCH 12/14] Add ORDER BY to queries and switch to snapshot testing.
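
Every query now ends in a deterministic ORDER BY, so the sorted-batch
comparison (assert_batches_sorted_eq) is no longer needed: plans and
results are asserted against inline snapshots instead, e.g.

    let pretty_results = pretty_format_batches(&distributed_results)?;
    assert_snapshot!(pretty_results, @"...");

The hand-rolled normalize()/validate_plan() helpers go away with it.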
--- tests/join.rs | 284 ++++++++++++++++++++++++-------------------- 1 file changed, 138 insertions(+), 146 deletions(-) diff --git a/tests/join.rs b/tests/join.rs index e38a5c8d..5523da1f 100644 --- a/tests/join.rs +++ b/tests/join.rs @@ -1,14 +1,18 @@ #[cfg(test)] mod tests { - use arrow::{array::RecordBatch, datatypes::DataType, util::pretty}; + use arrow::{ + array::RecordBatch, + datatypes::DataType, + util::pretty::{self, pretty_format_batches}, + }; use datafusion::{ - assert_batches_sorted_eq, error::Result, physical_plan::collect, prelude::{ParquetReadOptions, SessionContext, col}, }; use datafusion_distributed::{ - DefaultSessionBuilder, display_plan_ascii, test_utils::localhost::start_localhost_context, + DefaultSessionBuilder, assert_snapshot, display_plan_ascii, + test_utils::localhost::start_localhost_context, }; fn set_configs(ctx: &SessionContext) { @@ -76,24 +80,6 @@ mod tests { Ok((distributed_plan, distributed_results)) } - fn validate_plan(plan: String, target_plan: &'static str) { - let normalized_distributed = normalize(&plan); - let normalized_target = normalize(&target_plan); - assert_eq!( - normalized_distributed, normalized_target, - "Plan mismatch!\nTarget:\n{}\nActual:\n{}", - normalized_target, normalized_distributed - ); - } - - fn normalize(s: &str) -> String { - let current_dir = std::env::current_dir().unwrap().display().to_string(); - let dir_without_slash = current_dir.trim_start_matches('/'); - s.replace(&format!("{}/", current_dir), "") - .replace(&format!("{}/", dir_without_slash), "") - .replace(" ", "") - } - #[tokio::test] async fn test_join_hive() -> Result<(), Box<dyn std::error::Error>> { let query = r#" @@ -107,6 +93,7 @@ mod tests { FROM dim d INNER JOIN fact f ON d.d_dkey = f.f_dkey WHERE d.service = 'log' + ORDER BY f_dkey, timestamp "#; // ————————————————————————————————————————————————————————————— @@ -125,59 +112,62 @@ mod tests { // hive-style partitioning and avoiding data-shuffling repartitions.
// ————————————————————————————————————————————————————————————— - let target_plan = r#"┌───── DistributedExec ── Tasks: t0:[p0] -│ CoalescePartitionsExec -│ [Stage 1] => NetworkCoalesceExec: output_partitions=4, input_tasks=2 -└────────────────────────────────────────────────── - ┌───── Stage 1 ── Tasks: t0:[p0..p1] t1:[p2..p3] - │ ProjectionExec: expr=[f_dkey@5 as f_dkey, timestamp@3 as timestamp, value@4 as value, env@0 as env, service@1 as service, host@2 as host] - │ HashJoinExec: mode=Partitioned, join_type=Inner, on=[(d_dkey@3, f_dkey@2)], projection=[env@0, service@1, host@2, timestamp@4, value@5, f_dkey@6] - │ FilterExec: service@1 = log - │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] - │ DataSourceExec: file_groups={4 groups: [[testdata/join/parquet/dim/d_dkey=A/data0.parquet], [testdata/join/parquet/dim/d_dkey=B/data0.parquet], [testdata/join/parquet/dim/d_dkey=C/data0.parquet], [testdata/join/parquet/dim/d_dkey=D/data0.parquet]]}, projection=[env, service, host, d_dkey], file_type=parquet, predicate=service@1 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)] - │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] - │ DataSourceExec: file_groups={4 groups: [[testdata/join/parquet/fact/f_dkey=A/data0.parquet], [testdata/join/parquet/fact/f_dkey=B/data2.parquet, testdata/join/parquet/fact/f_dkey=B/data0.parquet, testdata/join/parquet/fact/f_dkey=B/data1.parquet], [testdata/join/parquet/fact/f_dkey=C/data0.parquet, testdata/join/parquet/fact/f_dkey=C/data1.parquet], [testdata/join/parquet/fact/f_dkey=D/data0.parquet]]}, projection=[timestamp, value, f_dkey], file_type=parquet, predicate=DynamicFilter [ empty ] - └────────────────────────────────────────────────── - "#; - validate_plan(distributed_plan, target_plan); + assert_snapshot!(&distributed_plan, + @" + ┌───── DistributedExec ── Tasks: t0:[p0] + │ SortPreservingMergeExec: [f_dkey@0 ASC NULLS LAST, timestamp@1 ASC NULLS LAST] + │ [Stage 1] => NetworkCoalesceExec: output_partitions=4, input_tasks=2 + └────────────────────────────────────────────────── + ┌───── Stage 1 ── Tasks: t0:[p0..p1] t1:[p2..p3] + │ SortExec: expr=[f_dkey@0 ASC NULLS LAST, timestamp@1 ASC NULLS LAST], preserve_partitioning=[true] + │ ProjectionExec: expr=[f_dkey@5 as f_dkey, timestamp@3 as timestamp, value@4 as value, env@0 as env, service@1 as service, host@2 as host] + │ HashJoinExec: mode=Partitioned, join_type=Inner, on=[(d_dkey@3, f_dkey@2)], projection=[env@0, service@1, host@2, timestamp@4, value@5, f_dkey@6] + │ FilterExec: service@1 = log + │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] + │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/dim/d_dkey=A/data0.parquet], [/testdata/join/parquet/dim/d_dkey=B/data0.parquet], [/testdata/join/parquet/dim/d_dkey=C/data0.parquet], [/testdata/join/parquet/dim/d_dkey=D/data0.parquet]]}, projection=[env, service, host, d_dkey], file_type=parquet, predicate=service@1 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)] + │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] + │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/fact/f_dkey=A/data0.parquet], [/testdata/join/parquet/fact/f_dkey=B/data2.parquet, /testdata/join/parquet/fact/f_dkey=B/data0.parquet, /testdata/join/parquet/fact/f_dkey=B/data1.parquet], 
[/testdata/join/parquet/fact/f_dkey=C/data0.parquet, /testdata/join/parquet/fact/f_dkey=C/data1.parquet], [/testdata/join/parquet/fact/f_dkey=D/data0.parquet]]}, projection=[timestamp, value, f_dkey], file_type=parquet, predicate=DynamicFilter [ empty ] + └────────────────────────────────────────────────── + "); // ————————————————————————————————————————————————————————————— // Ensure distributed results are correct. // ————————————————————————————————————————————————————————————— - let expected = vec![ - "+--------+---------------------+-------+------+---------+--------+", - "| f_dkey | timestamp | value | env | service | host |", - "+--------+---------------------+-------+------+---------+--------+", - "| A | 2023-01-01T09:00:00 | 95.5 | dev | log | host-y |", - "| A | 2023-01-01T09:00:10 | 102.3 | dev | log | host-y |", - "| A | 2023-01-01T09:00:20 | 98.7 | dev | log | host-y |", - "| A | 2023-01-01T09:12:20 | 105.1 | dev | log | host-y |", - "| A | 2023-01-01T09:12:30 | 100.0 | dev | log | host-y |", - "| A | 2023-01-01T09:12:40 | 150.0 | dev | log | host-y |", - "| A | 2023-01-01T09:12:50 | 120.8 | dev | log | host-y |", - "| B | 2023-01-01T11:00:00 | 72.8 | prod | log | host-x |", - "| B | 2023-01-01T11:00:10 | 79.4 | prod | log | host-x |", - "| B | 2023-01-01T11:00:20 | 76.1 | prod | log | host-x |", - "| B | 2023-01-01T11:00:30 | 83.7 | prod | log | host-x |", - "| B | 2023-01-01T11:12:30 | 77.2 | prod | log | host-x |", - "| B | 2023-01-01T09:00:00 | 75.2 | prod | log | host-x |", - "| B | 2023-01-01T09:00:10 | 82.4 | prod | log | host-x |", - "| B | 2023-01-01T09:00:20 | 78.9 | prod | log | host-x |", - "| B | 2023-01-01T09:00:30 | 85.6 | prod | log | host-x |", - "| B | 2023-01-01T09:12:30 | 80.0 | prod | log | host-x |", - "| B | 2023-01-01T09:12:40 | 120.0 | prod | log | host-x |", - "| B | 2023-01-01T09:12:50 | 92.3 | prod | log | host-x |", - "| B | 2023-01-01T10:00:00 | 88.5 | prod | log | host-x |", - "| B | 2023-01-01T10:00:10 | 91.2 | prod | log | host-x |", - "| B | 2023-01-01T10:00:20 | 87.3 | prod | log | host-x |", - "| B | 2023-01-01T10:00:30 | 94.1 | prod | log | host-x |", - "| B | 2023-01-01T10:12:30 | 89.5 | prod | log | host-x |", - "| B | 2023-01-01T10:12:40 | 95.8 | prod | log | host-x |", - "+--------+---------------------+-------+------+---------+--------+", - ]; + let pretty_results = pretty_format_batches(&distributed_results)?; + assert_snapshot!(pretty_results, + @" + +--------+---------------------+-------+------+---------+--------+ + | f_dkey | timestamp | value | env | service | host | + +--------+---------------------+-------+------+---------+--------+ + | A | 2023-01-01T09:00:00 | 95.5 | dev | log | host-y | + | A | 2023-01-01T09:00:10 | 102.3 | dev | log | host-y | + | A | 2023-01-01T09:00:20 | 98.7 | dev | log | host-y | + | A | 2023-01-01T09:12:20 | 105.1 | dev | log | host-y | + | A | 2023-01-01T09:12:30 | 100.0 | dev | log | host-y | + | A | 2023-01-01T09:12:40 | 150.0 | dev | log | host-y | + | A | 2023-01-01T09:12:50 | 120.8 | dev | log | host-y | + | B | 2023-01-01T09:00:00 | 75.2 | prod | log | host-x | + | B | 2023-01-01T09:00:10 | 82.4 | prod | log | host-x | + | B | 2023-01-01T09:00:20 | 78.9 | prod | log | host-x | + | B | 2023-01-01T09:00:30 | 85.6 | prod | log | host-x | + | B | 2023-01-01T09:12:30 | 80.0 | prod | log | host-x | + | B | 2023-01-01T09:12:40 | 120.0 | prod | log | host-x | + | B | 2023-01-01T09:12:50 | 92.3 | prod | log | host-x | + | B | 2023-01-01T10:00:00 | 88.5 | prod | log | host-x | + | B | 2023-01-01T10:00:10 
| 91.2 | prod | log | host-x | + | B | 2023-01-01T10:00:20 | 87.3 | prod | log | host-x | + | B | 2023-01-01T10:00:30 | 94.1 | prod | log | host-x | + | B | 2023-01-01T10:12:30 | 89.5 | prod | log | host-x | + | B | 2023-01-01T10:12:40 | 95.8 | prod | log | host-x | + | B | 2023-01-01T11:00:00 | 72.8 | prod | log | host-x | + | B | 2023-01-01T11:00:10 | 79.4 | prod | log | host-x | + | B | 2023-01-01T11:00:20 | 76.1 | prod | log | host-x | + | B | 2023-01-01T11:00:30 | 83.7 | prod | log | host-x | + | B | 2023-01-01T11:12:30 | 77.2 | prod | log | host-x | + +--------+---------------------+-------+------+---------+--------+ + "); - assert_batches_sorted_eq!(expected, &distributed_results); Ok(()) } @@ -202,6 +192,7 @@ mod tests { WHERE service = 'log' ) AS j GROUP BY f_dkey, time_bin, env + ORDER BY f_dkey, time_bin "#; // ————————————————————————————————————————————————————————————— @@ -220,48 +211,49 @@ mod tests { // hive-style partitioning and avoiding data-shuffling repartitions. // ————————————————————————————————————————————————————————————— - let target_plan = r#"┌───── DistributedExec ── Tasks: t0:[p0] -│ CoalescePartitionsExec -│ [Stage 1] => NetworkCoalesceExec: output_partitions=4, input_tasks=2 -└────────────────────────────────────────────────── - ┌───── Stage 1 ── Tasks: t0:[p0..p1] t1:[p2..p3] - │ ProjectionExec: expr=[f_dkey@0 as f_dkey, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp)@1 as time_bin, env@2 as env, max(j.value)@3 as max_bin_value] - │ AggregateExec: mode=SinglePartitioned, gby=[f_dkey@0 as f_dkey, date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }, timestamp@2) as date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp), env@1 as env], aggr=[max(j.value)], ordering_mode=PartiallySorted([0, 1]) - │ ProjectionExec: expr=[f_dkey@3 as f_dkey, env@0 as env, timestamp@1 as timestamp, value@2 as value] - │ HashJoinExec: mode=Partitioned, join_type=Inner, on=[(d_dkey@1, f_dkey@2)], projection=[env@0, timestamp@2, value@3, f_dkey@4] - │ FilterExec: service@1 = log, projection=[env@0, d_dkey@2] - │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] - │ DataSourceExec: file_groups={4 groups: [[testdata/join/parquet/dim/d_dkey=A/data0.parquet], [testdata/join/parquet/dim/d_dkey=B/data0.parquet], [testdata/join/parquet/dim/d_dkey=C/data0.parquet], [testdata/join/parquet/dim/d_dkey=D/data0.parquet]]}, projection=[env, service, d_dkey], file_type=parquet, predicate=service@1 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)] - │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] - │ DataSourceExec: file_groups={4 groups: [[testdata/join/parquet/fact/f_dkey=A/data0.parquet], [testdata/join/parquet/fact/f_dkey=B/data2.parquet, testdata/join/parquet/fact/f_dkey=B/data0.parquet, testdata/join/parquet/fact/f_dkey=B/data1.parquet], [testdata/join/parquet/fact/f_dkey=C/data0.parquet, testdata/join/parquet/fact/f_dkey=C/data1.parquet], [testdata/join/parquet/fact/f_dkey=D/data0.parquet]]}, projection=[timestamp, value, f_dkey], file_type=parquet, predicate=DynamicFilter [ empty ] - └────────────────────────────────────────────────── - "#; - validate_plan(distributed_plan, target_plan); + assert_snapshot!(&distributed_plan, @r#" + ┌───── DistributedExec ── Tasks: t0:[p0] + │ SortPreservingMergeExec: [f_dkey@0 ASC NULLS 
LAST, time_bin@1 ASC NULLS LAST] + │ [Stage 1] => NetworkCoalesceExec: output_partitions=4, input_tasks=2 + └────────────────────────────────────────────────── + ┌───── Stage 1 ── Tasks: t0:[p0..p1] t1:[p2..p3] + │ SortExec: expr=[f_dkey@0 ASC NULLS LAST, time_bin@1 ASC NULLS LAST], preserve_partitioning=[true] + │ ProjectionExec: expr=[f_dkey@0 as f_dkey, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp)@1 as time_bin, env@2 as env, max(j.value)@3 as max_bin_value] + │ AggregateExec: mode=SinglePartitioned, gby=[f_dkey@0 as f_dkey, date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }, timestamp@2) as date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp), env@1 as env], aggr=[max(j.value)], ordering_mode=PartiallySorted([0, 1]) + │ ProjectionExec: expr=[f_dkey@3 as f_dkey, env@0 as env, timestamp@1 as timestamp, value@2 as value] + │ HashJoinExec: mode=Partitioned, join_type=Inner, on=[(d_dkey@1, f_dkey@2)], projection=[env@0, timestamp@2, value@3, f_dkey@4] + │ FilterExec: service@1 = log, projection=[env@0, d_dkey@2] + │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] + │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/dim/d_dkey=A/data0.parquet], [/testdata/join/parquet/dim/d_dkey=B/data0.parquet], [/testdata/join/parquet/dim/d_dkey=C/data0.parquet], [/testdata/join/parquet/dim/d_dkey=D/data0.parquet]]}, projection=[env, service, d_dkey], file_type=parquet, predicate=service@1 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)] + │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] + │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/fact/f_dkey=A/data0.parquet], [/testdata/join/parquet/fact/f_dkey=B/data2.parquet, /testdata/join/parquet/fact/f_dkey=B/data0.parquet, /testdata/join/parquet/fact/f_dkey=B/data1.parquet], [/testdata/join/parquet/fact/f_dkey=C/data0.parquet, /testdata/join/parquet/fact/f_dkey=C/data1.parquet], [/testdata/join/parquet/fact/f_dkey=D/data0.parquet]]}, projection=[timestamp, value, f_dkey], file_type=parquet, predicate=DynamicFilter [ empty ] + └────────────────────────────────────────────────── + "#); // ————————————————————————————————————————————————————————————— // Ensure distributed results are correct. 
// ————————————————————————————————————————————————————————————— - let expected = vec![ - "+--------+---------------------+------+---------------+", - "| f_dkey | time_bin | env | max_bin_value |", - "+--------+---------------------+------+---------------+", - "| A | 2023-01-01T09:00:00 | dev | 102.3 |", - "| A | 2023-01-01T09:12:00 | dev | 105.1 |", - "| A | 2023-01-01T09:12:30 | dev | 150.0 |", - "| B | 2023-01-01T11:00:00 | prod | 79.4 |", - "| B | 2023-01-01T11:00:30 | prod | 83.7 |", - "| B | 2023-01-01T11:12:30 | prod | 77.2 |", - "| B | 2023-01-01T09:00:00 | prod | 82.4 |", - "| B | 2023-01-01T09:00:30 | prod | 85.6 |", - "| B | 2023-01-01T09:12:30 | prod | 120.0 |", - "| B | 2023-01-01T10:00:00 | prod | 91.2 |", - "| B | 2023-01-01T10:00:30 | prod | 94.1 |", - "| B | 2023-01-01T10:12:30 | prod | 95.8 |", - "+--------+---------------------+------+---------------+", - ]; + let pretty_results = pretty_format_batches(&distributed_results)?; + assert_snapshot!(pretty_results, @" + +--------+---------------------+------+---------------+ + | f_dkey | time_bin | env | max_bin_value | + +--------+---------------------+------+---------------+ + | A | 2023-01-01T09:00:00 | dev | 102.3 | + | A | 2023-01-01T09:12:00 | dev | 105.1 | + | A | 2023-01-01T09:12:30 | dev | 150.0 | + | B | 2023-01-01T09:00:00 | prod | 82.4 | + | B | 2023-01-01T09:00:30 | prod | 85.6 | + | B | 2023-01-01T09:12:30 | prod | 120.0 | + | B | 2023-01-01T10:00:00 | prod | 91.2 | + | B | 2023-01-01T10:00:30 | prod | 94.1 | + | B | 2023-01-01T10:12:30 | prod | 95.8 | + | B | 2023-01-01T11:00:00 | prod | 79.4 | + | B | 2023-01-01T11:00:30 | prod | 83.7 | + | B | 2023-01-01T11:12:30 | prod | 77.2 | + +--------+---------------------+------+---------------+ + "); - assert_batches_sorted_eq!(expected, &distributed_results); Ok(()) } @@ -310,54 +302,54 @@ mod tests { // hive-style partitioning and avoiding data-shuffling repartitions. 
// ————————————————————————————————————————————————————————————— - let target_plan = r#"┌───── DistributedExec ── Tasks: t0:[p0] -│ SortPreservingMergeExec: [env@0 ASC NULLS LAST, time_bin@1 ASC NULLS LAST] -│ SortExec: expr=[env@0 ASC NULLS LAST, time_bin@1 ASC NULLS LAST], preserve_partitioning=[true] -│ ProjectionExec: expr=[env@0 as env, time_bin@1 as time_bin, avg(a.max_bin_value)@2 as avg_max_value] -│ AggregateExec: mode=FinalPartitioned, gby=[env@0 as env, time_bin@1 as time_bin], aggr=[avg(a.max_bin_value)] -│ [Stage 1] => NetworkShuffleExec: output_partitions=4, input_tasks=2 -└────────────────────────────────────────────────── - ┌───── Stage 1 ── Tasks: t0:[p0..p3] t1:[p0..p3] - │ CoalesceBatchesExec: target_batch_size=8192 - │ RepartitionExec: partitioning=Hash([env@0, time_bin@1], 4), input_partitions=2 - │ AggregateExec: mode=Partial, gby=[env@1 as env, time_bin@0 as time_bin], aggr=[avg(a.max_bin_value)] - │ ProjectionExec: expr=[date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp)@1 as time_bin, env@2 as env, max(j.value)@3 as max_bin_value] - │ AggregateExec: mode=SinglePartitioned, gby=[f_dkey@0 as f_dkey, date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }, timestamp@2) as date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp), env@1 as env], aggr=[max(j.value)], ordering_mode=PartiallySorted([0, 1]) - │ ProjectionExec: expr=[f_dkey@3 as f_dkey, env@0 as env, timestamp@1 as timestamp, value@2 as value] - │ HashJoinExec: mode=Partitioned, join_type=Inner, on=[(d_dkey@1, f_dkey@2)], projection=[env@0, timestamp@2, value@3, f_dkey@4] - │ FilterExec: service@1 = log, projection=[env@0, d_dkey@2] - │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] - │ DataSourceExec: file_groups={4 groups: [[testdata/join/parquet/dim/d_dkey=A/data0.parquet], [testdata/join/parquet/dim/d_dkey=B/data0.parquet], [testdata/join/parquet/dim/d_dkey=C/data0.parquet], [testdata/join/parquet/dim/d_dkey=D/data0.parquet]]}, projection=[env, service, d_dkey], file_type=parquet, predicate=service@1 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)] - │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] - │ DataSourceExec: file_groups={4 groups: [[testdata/join/parquet/fact/f_dkey=A/data0.parquet], [testdata/join/parquet/fact/f_dkey=B/data2.parquet, testdata/join/parquet/fact/f_dkey=B/data0.parquet, testdata/join/parquet/fact/f_dkey=B/data1.parquet], [testdata/join/parquet/fact/f_dkey=C/data0.parquet, testdata/join/parquet/fact/f_dkey=C/data1.parquet], [testdata/join/parquet/fact/f_dkey=D/data0.parquet]]}, projection=[timestamp, value, f_dkey], file_type=parquet, predicate=DynamicFilter [ empty ] - └────────────────────────────────────────────────── - "#; - validate_plan(distributed_plan, target_plan); + assert_snapshot!(&distributed_plan, @r#" + ┌───── DistributedExec ── Tasks: t0:[p0] + │ SortPreservingMergeExec: [env@0 ASC NULLS LAST, time_bin@1 ASC NULLS LAST] + │ SortExec: expr=[env@0 ASC NULLS LAST, time_bin@1 ASC NULLS LAST], preserve_partitioning=[true] + │ ProjectionExec: expr=[env@0 as env, time_bin@1 as time_bin, avg(a.max_bin_value)@2 as avg_max_value] + │ AggregateExec: mode=FinalPartitioned, gby=[env@0 as env, time_bin@1 as time_bin], aggr=[avg(a.max_bin_value)] + │ [Stage 1] => NetworkShuffleExec: output_partitions=4, 
input_tasks=2 + └────────────────────────────────────────────────── + ┌───── Stage 1 ── Tasks: t0:[p0..p3] t1:[p0..p3] + │ CoalesceBatchesExec: target_batch_size=8192 + │ RepartitionExec: partitioning=Hash([env@0, time_bin@1], 4), input_partitions=2 + │ AggregateExec: mode=Partial, gby=[env@1 as env, time_bin@0 as time_bin], aggr=[avg(a.max_bin_value)] + │ ProjectionExec: expr=[date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp)@1 as time_bin, env@2 as env, max(j.value)@3 as max_bin_value] + │ AggregateExec: mode=SinglePartitioned, gby=[f_dkey@0 as f_dkey, date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }, timestamp@2) as date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp), env@1 as env], aggr=[max(j.value)], ordering_mode=PartiallySorted([0, 1]) + │ ProjectionExec: expr=[f_dkey@3 as f_dkey, env@0 as env, timestamp@1 as timestamp, value@2 as value] + │ HashJoinExec: mode=Partitioned, join_type=Inner, on=[(d_dkey@1, f_dkey@2)], projection=[env@0, timestamp@2, value@3, f_dkey@4] + │ FilterExec: service@1 = log, projection=[env@0, d_dkey@2] + │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] + │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/dim/d_dkey=A/data0.parquet], [/testdata/join/parquet/dim/d_dkey=B/data0.parquet], [/testdata/join/parquet/dim/d_dkey=C/data0.parquet], [/testdata/join/parquet/dim/d_dkey=D/data0.parquet]]}, projection=[env, service, d_dkey], file_type=parquet, predicate=service@1 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)] + │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] + │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/fact/f_dkey=A/data0.parquet], [/testdata/join/parquet/fact/f_dkey=B/data2.parquet, /testdata/join/parquet/fact/f_dkey=B/data0.parquet, /testdata/join/parquet/fact/f_dkey=B/data1.parquet], [/testdata/join/parquet/fact/f_dkey=C/data0.parquet, /testdata/join/parquet/fact/f_dkey=C/data1.parquet], [/testdata/join/parquet/fact/f_dkey=D/data0.parquet]]}, projection=[timestamp, value, f_dkey], file_type=parquet, predicate=DynamicFilter [ empty ] + └────────────────────────────────────────────────── + "#); // ————————————————————————————————————————————————————————————— // Ensure distributed results are correct. 
// ————————————————————————————————————————————————————————————— - let expected = vec![ - "+------+---------------------+---------------+", - "| env | time_bin | avg_max_value |", - "+------+---------------------+---------------+", - "| dev | 2023-01-01T09:00:00 | 102.3 |", - "| dev | 2023-01-01T09:12:00 | 105.1 |", - "| dev | 2023-01-01T09:12:30 | 150.0 |", - "| prod | 2023-01-01T09:00:00 | 82.4 |", - "| prod | 2023-01-01T09:00:30 | 85.6 |", - "| prod | 2023-01-01T09:12:30 | 120.0 |", - "| prod | 2023-01-01T10:00:00 | 91.2 |", - "| prod | 2023-01-01T10:00:30 | 94.1 |", - "| prod | 2023-01-01T10:12:30 | 95.8 |", - "| prod | 2023-01-01T11:00:00 | 79.4 |", - "| prod | 2023-01-01T11:00:30 | 83.7 |", - "| prod | 2023-01-01T11:12:30 | 77.2 |", - "+------+---------------------+---------------+", - ]; + let pretty_results = pretty_format_batches(&distributed_results)?; + assert_snapshot!(pretty_results, @" + +------+---------------------+---------------+ + | env | time_bin | avg_max_value | + +------+---------------------+---------------+ + | dev | 2023-01-01T09:00:00 | 102.3 | + | dev | 2023-01-01T09:12:00 | 105.1 | + | dev | 2023-01-01T09:12:30 | 150.0 | + | prod | 2023-01-01T09:00:00 | 82.4 | + | prod | 2023-01-01T09:00:30 | 85.6 | + | prod | 2023-01-01T09:12:30 | 120.0 | + | prod | 2023-01-01T10:00:00 | 91.2 | + | prod | 2023-01-01T10:00:30 | 94.1 | + | prod | 2023-01-01T10:12:30 | 95.8 | + | prod | 2023-01-01T11:00:00 | 79.4 | + | prod | 2023-01-01T11:00:30 | 83.7 | + | prod | 2023-01-01T11:12:30 | 77.2 | + +------+---------------------+---------------+ + "); - assert_batches_sorted_eq!(expected, &distributed_results); Ok(()) } } From e30426715545bbd06d1463b26f2018f37e134449 Mon Sep 17 00:00:00 2001 From: Justin O'Dwyer Date: Tue, 30 Dec 2025 13:57:42 -0500 Subject: [PATCH 13/14] Nulls last instead of nulls first. --- tests/join.rs | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/tests/join.rs b/tests/join.rs index 5523da1f..d223151f 100644 --- a/tests/join.rs +++ b/tests/join.rs @@ -55,10 +55,9 @@ mod tests { // Register hive-style partitioning for the fact table. let fact_options = ParquetReadOptions::default() .table_partition_cols(vec![("f_dkey".to_string(), DataType::Utf8)]) - // TODO: Figure out why file sort order does not display in plan. 
.file_sort_order(vec![vec![ - col("f_dkey").sort(true, true), - col("timestamp").sort(true, true), + col("f_dkey").sort(true, false), + col("timestamp").sort(true, false), ]]); ctx.register_parquet("fact", "testdata/join/parquet/fact", fact_options) .await?; @@ -119,14 +118,13 @@ mod tests { │ [Stage 1] => NetworkCoalesceExec: output_partitions=4, input_tasks=2 └────────────────────────────────────────────────── ┌───── Stage 1 ── Tasks: t0:[p0..p1] t1:[p2..p3] - │ SortExec: expr=[f_dkey@0 ASC NULLS LAST, timestamp@1 ASC NULLS LAST], preserve_partitioning=[true] - │ ProjectionExec: expr=[f_dkey@5 as f_dkey, timestamp@3 as timestamp, value@4 as value, env@0 as env, service@1 as service, host@2 as host] - │ HashJoinExec: mode=Partitioned, join_type=Inner, on=[(d_dkey@3, f_dkey@2)], projection=[env@0, service@1, host@2, timestamp@4, value@5, f_dkey@6] - │ FilterExec: service@1 = log - │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] - │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/dim/d_dkey=A/data0.parquet], [/testdata/join/parquet/dim/d_dkey=B/data0.parquet], [/testdata/join/parquet/dim/d_dkey=C/data0.parquet], [/testdata/join/parquet/dim/d_dkey=D/data0.parquet]]}, projection=[env, service, host, d_dkey], file_type=parquet, predicate=service@1 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)] + │ ProjectionExec: expr=[f_dkey@5 as f_dkey, timestamp@3 as timestamp, value@4 as value, env@0 as env, service@1 as service, host@2 as host] + │ HashJoinExec: mode=Partitioned, join_type=Inner, on=[(d_dkey@3, f_dkey@2)], projection=[env@0, service@1, host@2, timestamp@4, value@5, f_dkey@6] + │ FilterExec: service@1 = log │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] - │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/fact/f_dkey=A/data0.parquet], [/testdata/join/parquet/fact/f_dkey=B/data2.parquet, /testdata/join/parquet/fact/f_dkey=B/data0.parquet, /testdata/join/parquet/fact/f_dkey=B/data1.parquet], [/testdata/join/parquet/fact/f_dkey=C/data0.parquet, /testdata/join/parquet/fact/f_dkey=C/data1.parquet], [/testdata/join/parquet/fact/f_dkey=D/data0.parquet]]}, projection=[timestamp, value, f_dkey], file_type=parquet, predicate=DynamicFilter [ empty ] + │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/dim/d_dkey=A/data0.parquet], [/testdata/join/parquet/dim/d_dkey=B/data0.parquet], [/testdata/join/parquet/dim/d_dkey=C/data0.parquet], [/testdata/join/parquet/dim/d_dkey=D/data0.parquet]]}, projection=[env, service, host, d_dkey], file_type=parquet, predicate=service@1 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)] + │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] + │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/fact/f_dkey=A/data0.parquet], [/testdata/join/parquet/fact/f_dkey=B/data0.parquet, /testdata/join/parquet/fact/f_dkey=B/data1.parquet, /testdata/join/parquet/fact/f_dkey=B/data2.parquet], [/testdata/join/parquet/fact/f_dkey=C/data0.parquet, /testdata/join/parquet/fact/f_dkey=C/data1.parquet], [/testdata/join/parquet/fact/f_dkey=D/data0.parquet]]}, projection=[timestamp, value, f_dkey], output_ordering=[f_dkey@2 ASC NULLS LAST, timestamp@0 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ] └────────────────────────────────────────────────── "); @@ -217,16 +215,15 @@ mod tests 
{ │ [Stage 1] => NetworkCoalesceExec: output_partitions=4, input_tasks=2 └────────────────────────────────────────────────── ┌───── Stage 1 ── Tasks: t0:[p0..p1] t1:[p2..p3] - │ SortExec: expr=[f_dkey@0 ASC NULLS LAST, time_bin@1 ASC NULLS LAST], preserve_partitioning=[true] - │ ProjectionExec: expr=[f_dkey@0 as f_dkey, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp)@1 as time_bin, env@2 as env, max(j.value)@3 as max_bin_value] - │ AggregateExec: mode=SinglePartitioned, gby=[f_dkey@0 as f_dkey, date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }, timestamp@2) as date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp), env@1 as env], aggr=[max(j.value)], ordering_mode=PartiallySorted([0, 1]) - │ ProjectionExec: expr=[f_dkey@3 as f_dkey, env@0 as env, timestamp@1 as timestamp, value@2 as value] - │ HashJoinExec: mode=Partitioned, join_type=Inner, on=[(d_dkey@1, f_dkey@2)], projection=[env@0, timestamp@2, value@3, f_dkey@4] - │ FilterExec: service@1 = log, projection=[env@0, d_dkey@2] - │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] - │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/dim/d_dkey=A/data0.parquet], [/testdata/join/parquet/dim/d_dkey=B/data0.parquet], [/testdata/join/parquet/dim/d_dkey=C/data0.parquet], [/testdata/join/parquet/dim/d_dkey=D/data0.parquet]]}, projection=[env, service, d_dkey], file_type=parquet, predicate=service@1 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)] + │ ProjectionExec: expr=[f_dkey@0 as f_dkey, date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp)@1 as time_bin, env@2 as env, max(j.value)@3 as max_bin_value] + │ AggregateExec: mode=SinglePartitioned, gby=[f_dkey@0 as f_dkey, date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }, timestamp@2) as date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp), env@1 as env], aggr=[max(j.value)], ordering_mode=PartiallySorted([0, 1]) + │ ProjectionExec: expr=[f_dkey@3 as f_dkey, env@0 as env, timestamp@1 as timestamp, value@2 as value] + │ HashJoinExec: mode=Partitioned, join_type=Inner, on=[(d_dkey@1, f_dkey@2)], projection=[env@0, timestamp@2, value@3, f_dkey@4] + │ FilterExec: service@1 = log, projection=[env@0, d_dkey@2] │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] - │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/fact/f_dkey=A/data0.parquet], [/testdata/join/parquet/fact/f_dkey=B/data2.parquet, /testdata/join/parquet/fact/f_dkey=B/data0.parquet, /testdata/join/parquet/fact/f_dkey=B/data1.parquet], [/testdata/join/parquet/fact/f_dkey=C/data0.parquet, /testdata/join/parquet/fact/f_dkey=C/data1.parquet], [/testdata/join/parquet/fact/f_dkey=D/data0.parquet]]}, projection=[timestamp, value, f_dkey], file_type=parquet, predicate=DynamicFilter [ empty ] + │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/dim/d_dkey=A/data0.parquet], [/testdata/join/parquet/dim/d_dkey=B/data0.parquet], [/testdata/join/parquet/dim/d_dkey=C/data0.parquet], [/testdata/join/parquet/dim/d_dkey=D/data0.parquet]]}, projection=[env, service, d_dkey], file_type=parquet, predicate=service@1 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= 
log AND log <= service_max@1, required_guarantees=[service in (log)] + │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] + │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/fact/f_dkey=A/data0.parquet], [/testdata/join/parquet/fact/f_dkey=B/data0.parquet, /testdata/join/parquet/fact/f_dkey=B/data1.parquet, /testdata/join/parquet/fact/f_dkey=B/data2.parquet], [/testdata/join/parquet/fact/f_dkey=C/data0.parquet, /testdata/join/parquet/fact/f_dkey=C/data1.parquet], [/testdata/join/parquet/fact/f_dkey=D/data0.parquet]]}, projection=[timestamp, value, f_dkey], output_ordering=[f_dkey@2 ASC NULLS LAST, timestamp@0 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ] └────────────────────────────────────────────────── "#); @@ -322,7 +319,7 @@ mod tests { │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/dim/d_dkey=A/data0.parquet], [/testdata/join/parquet/dim/d_dkey=B/data0.parquet], [/testdata/join/parquet/dim/d_dkey=C/data0.parquet], [/testdata/join/parquet/dim/d_dkey=D/data0.parquet]]}, projection=[env, service, d_dkey], file_type=parquet, predicate=service@1 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)] │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] - │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/fact/f_dkey=A/data0.parquet], [/testdata/join/parquet/fact/f_dkey=B/data2.parquet, /testdata/join/parquet/fact/f_dkey=B/data0.parquet, /testdata/join/parquet/fact/f_dkey=B/data1.parquet], [/testdata/join/parquet/fact/f_dkey=C/data0.parquet, /testdata/join/parquet/fact/f_dkey=C/data1.parquet], [/testdata/join/parquet/fact/f_dkey=D/data0.parquet]]}, projection=[timestamp, value, f_dkey], file_type=parquet, predicate=DynamicFilter [ empty ] + │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/fact/f_dkey=A/data0.parquet], [/testdata/join/parquet/fact/f_dkey=B/data0.parquet, /testdata/join/parquet/fact/f_dkey=B/data1.parquet, /testdata/join/parquet/fact/f_dkey=B/data2.parquet], [/testdata/join/parquet/fact/f_dkey=C/data0.parquet, /testdata/join/parquet/fact/f_dkey=C/data1.parquet], [/testdata/join/parquet/fact/f_dkey=D/data0.parquet]]}, projection=[timestamp, value, f_dkey], output_ordering=[f_dkey@2 ASC NULLS LAST, timestamp@0 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ] └────────────────────────────────────────────────── "#); From 3709fce19544dacbdf5a1d66824769a87182d3bd Mon Sep 17 00:00:00 2001 From: Justin O'Dwyer Date: Wed, 21 Jan 2026 20:46:44 -0500 Subject: [PATCH 14/14] Fix tests. 
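Drop the extra fact files for partitions B and C (f_dkey=B/data1, f_dkey=B/data2, f_dkey=C/data1), together with their CSV sources and the matching COPY statements in generate_parquet_from_csv.sql, leaving one parquet file per hive partition of the fact table. Presumably this keeps the data physically consistent with the file_sort_order declared in the previous patch, since a multi-file group is only as sorted as the concatenation of its files. The plan snapshots shrink to one file per fact group while keeping output_ordering=[f_dkey ASC NULLS LAST, timestamp ASC NULLS LAST], the result snapshots lose the rows that came from the removed files, set_configs now takes &mut SessionContext, and the decorative comment separators are dropped.

For reference, a minimal sketch of the fact-table registration as it stands after this series; the options chain is taken from tests/join.rs, and only the async wrapper function around it is invented here:

    use datafusion::arrow::datatypes::DataType;
    use datafusion::prelude::*;

    async fn register_fact(ctx: &SessionContext) -> datafusion::error::Result<()> {
        let fact_options = ParquetReadOptions::default()
            // Hive-style partition column parsed from .../f_dkey=X/ paths.
            .table_partition_cols(vec![("f_dkey".to_string(), DataType::Utf8)])
            // Declared per-file sort order. sort(asc, nulls_first) with
            // (true, false) is what prints as "ASC NULLS LAST" in the
            // plan's output_ordering.
            .file_sort_order(vec![vec![
                col("f_dkey").sort(true, false),
                col("timestamp").sort(true, false),
            ]]);
        ctx.register_parquet("fact", "testdata/join/parquet/fact", fact_options)
            .await?;
        Ok(())
    }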
--- testdata/join/csv/fact/f_dkey=B/data1.csv | 8 -- testdata/join/csv/fact/f_dkey=B/data2.csv | 7 -- testdata/join/csv/fact/f_dkey=C/data1.csv | 9 -- testdata/join/generate_parquet_from_csv.sql | 12 --- .../join/parquet/fact/f_dkey=B/data1.parquet | 3 - .../join/parquet/fact/f_dkey=B/data2.parquet | 3 - .../join/parquet/fact/f_dkey=C/data1.parquet | 3 - tests/join.rs | 89 ++++--------------- 8 files changed, 19 insertions(+), 115 deletions(-) delete mode 100644 testdata/join/csv/fact/f_dkey=B/data1.csv delete mode 100644 testdata/join/csv/fact/f_dkey=B/data2.csv delete mode 100644 testdata/join/csv/fact/f_dkey=C/data1.csv delete mode 100644 testdata/join/parquet/fact/f_dkey=B/data1.parquet delete mode 100644 testdata/join/parquet/fact/f_dkey=B/data2.parquet delete mode 100644 testdata/join/parquet/fact/f_dkey=C/data1.parquet diff --git a/testdata/join/csv/fact/f_dkey=B/data1.csv b/testdata/join/csv/fact/f_dkey=B/data1.csv deleted file mode 100644 index 2af219e4..00000000 --- a/testdata/join/csv/fact/f_dkey=B/data1.csv +++ /dev/null @@ -1,8 +0,0 @@ -timestamp,value -2023-01-01T10:00:00,88.5 -2023-01-01T10:00:10,91.2 -2023-01-01T10:00:20,87.3 -2023-01-01T10:00:30,94.1 -2023-01-01T10:12:30,89.5 -2023-01-01T10:12:40,95.8 - diff --git a/testdata/join/csv/fact/f_dkey=B/data2.csv b/testdata/join/csv/fact/f_dkey=B/data2.csv deleted file mode 100644 index 3f807744..00000000 --- a/testdata/join/csv/fact/f_dkey=B/data2.csv +++ /dev/null @@ -1,7 +0,0 @@ -timestamp,value -2023-01-01T11:00:00,72.8 -2023-01-01T11:00:10,79.4 -2023-01-01T11:00:20,76.1 -2023-01-01T11:00:30,83.7 -2023-01-01T11:12:30,77.2 - diff --git a/testdata/join/csv/fact/f_dkey=C/data1.csv b/testdata/join/csv/fact/f_dkey=C/data1.csv deleted file mode 100644 index 1c1ff5a7..00000000 --- a/testdata/join/csv/fact/f_dkey=C/data1.csv +++ /dev/null @@ -1,9 +0,0 @@ -timestamp,value -2023-01-01T11:00:00,295.3 -2023-01-01T11:00:10,318.6 -2023-01-01T11:00:20,342.9 -2023-01-01T11:00:30,287.4 -2023-01-01T11:00:40,365.2 -2023-01-01T11:12:40,310.8 -2023-01-01T11:12:50,298.1 - diff --git a/testdata/join/generate_parquet_from_csv.sql b/testdata/join/generate_parquet_from_csv.sql index 7137ff6d..e4af6065 100644 --- a/testdata/join/generate_parquet_from_csv.sql +++ b/testdata/join/generate_parquet_from_csv.sql @@ -26,22 +26,10 @@ COPY (SELECT * FROM "testdata/join/csv/fact/f_dkey=B/data0.csv") TO "testdata/join/parquet/fact/f_dkey=B/data0.parquet" STORED AS PARQUET; -COPY (SELECT * FROM "testdata/join/csv/fact/f_dkey=B/data1.csv") -TO "testdata/join/parquet/fact/f_dkey=B/data1.parquet" -STORED AS PARQUET; - -COPY (SELECT * FROM "testdata/join/csv/fact/f_dkey=B/data2.csv") -TO "testdata/join/parquet/fact/f_dkey=B/data2.parquet" -STORED AS PARQUET; - COPY (SELECT * FROM "testdata/join/csv/fact/f_dkey=C/data0.csv") TO "testdata/join/parquet/fact/f_dkey=C/data0.parquet" STORED AS PARQUET; -COPY (SELECT * FROM "testdata/join/csv/fact/f_dkey=C/data1.csv") -TO "testdata/join/parquet/fact/f_dkey=C/data1.parquet" -STORED AS PARQUET; - COPY (SELECT * FROM "testdata/join/csv/fact/f_dkey=D/data0.csv") TO "testdata/join/parquet/fact/f_dkey=D/data0.parquet" STORED AS PARQUET; diff --git a/testdata/join/parquet/fact/f_dkey=B/data1.parquet b/testdata/join/parquet/fact/f_dkey=B/data1.parquet deleted file mode 100644 index fba94ef8..00000000 --- a/testdata/join/parquet/fact/f_dkey=B/data1.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:440f0bbac5647184e5420be6366ef443824885dd74989cbde08b2de245a5e359 -size 920 diff --git 
a/testdata/join/parquet/fact/f_dkey=B/data2.parquet b/testdata/join/parquet/fact/f_dkey=B/data2.parquet deleted file mode 100644 index 071ecb89..00000000 --- a/testdata/join/parquet/fact/f_dkey=B/data2.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8a71b0c6efe0bf341558e7895db364ea045451729e7ea41957dd25851d2193aa -size 913 diff --git a/testdata/join/parquet/fact/f_dkey=C/data1.parquet b/testdata/join/parquet/fact/f_dkey=C/data1.parquet deleted file mode 100644 index 9825ab19..00000000 --- a/testdata/join/parquet/fact/f_dkey=C/data1.parquet +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:275f89a315e5131dea43789a21ff5fed2ffec4b8337558a76189c8e390826785 -size 936 diff --git a/tests/join.rs b/tests/join.rs index d223151f..5fe97ebd 100644 --- a/tests/join.rs +++ b/tests/join.rs @@ -15,7 +15,7 @@ mod tests { test_utils::localhost::start_localhost_context, }; - fn set_configs(ctx: &SessionContext) { + fn set_configs(ctx: &mut SessionContext) { // Preserve hive-style file partitions. ctx.state_ref() .write() @@ -95,22 +95,16 @@ mod tests { ORDER BY f_dkey, timestamp "#; - // ————————————————————————————————————————————————————————————— // Execute the query using distributed datafusion, 2 workers, // and hive-style partitioned data. - // ————————————————————————————————————————————————————————————— - - let (distributed_ctx, _guard) = start_localhost_context(2, DefaultSessionBuilder).await; - set_configs(&distributed_ctx); + let (mut distributed_ctx, _guard) = start_localhost_context(2, DefaultSessionBuilder).await; + set_configs(&mut distributed_ctx); register_tables(&distributed_ctx).await?; let (distributed_plan, distributed_results) = execute_query(&distributed_ctx, query).await?; - // ————————————————————————————————————————————————————————————— // Ensure the distributed plan matches our target plan, registering // hive-style partitioning and avoiding data-shuffling repartitions. 
- // ————————————————————————————————————————————————————————————— - assert_snapshot!(&distributed_plan, @" ┌───── DistributedExec ── Tasks: t0:[p0] @@ -124,14 +118,11 @@ mod tests { │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/dim/d_dkey=A/data0.parquet], [/testdata/join/parquet/dim/d_dkey=B/data0.parquet], [/testdata/join/parquet/dim/d_dkey=C/data0.parquet], [/testdata/join/parquet/dim/d_dkey=D/data0.parquet]]}, projection=[env, service, host, d_dkey], file_type=parquet, predicate=service@1 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)] │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] - │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/fact/f_dkey=A/data0.parquet], [/testdata/join/parquet/fact/f_dkey=B/data0.parquet, /testdata/join/parquet/fact/f_dkey=B/data1.parquet, /testdata/join/parquet/fact/f_dkey=B/data2.parquet], [/testdata/join/parquet/fact/f_dkey=C/data0.parquet, /testdata/join/parquet/fact/f_dkey=C/data1.parquet], [/testdata/join/parquet/fact/f_dkey=D/data0.parquet]]}, projection=[timestamp, value, f_dkey], output_ordering=[f_dkey@2 ASC NULLS LAST, timestamp@0 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ] + │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/fact/f_dkey=A/data0.parquet], [/testdata/join/parquet/fact/f_dkey=B/data0.parquet], [/testdata/join/parquet/fact/f_dkey=C/data0.parquet], [/testdata/join/parquet/fact/f_dkey=D/data0.parquet]]}, projection=[timestamp, value, f_dkey], output_ordering=[f_dkey@2 ASC NULLS LAST, timestamp@0 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ] └────────────────────────────────────────────────── "); - // ————————————————————————————————————————————————————————————— // Ensure distributed results are correct. - // ————————————————————————————————————————————————————————————— - let pretty_results = pretty_format_batches(&distributed_results)?; assert_snapshot!(pretty_results, @" @@ -152,17 +143,6 @@ mod tests { | B | 2023-01-01T09:12:30 | 80.0 | prod | log | host-x | | B | 2023-01-01T09:12:40 | 120.0 | prod | log | host-x | | B | 2023-01-01T09:12:50 | 92.3 | prod | log | host-x | - | B | 2023-01-01T10:00:00 | 88.5 | prod | log | host-x | - | B | 2023-01-01T10:00:10 | 91.2 | prod | log | host-x | - | B | 2023-01-01T10:00:20 | 87.3 | prod | log | host-x | - | B | 2023-01-01T10:00:30 | 94.1 | prod | log | host-x | - | B | 2023-01-01T10:12:30 | 89.5 | prod | log | host-x | - | B | 2023-01-01T10:12:40 | 95.8 | prod | log | host-x | - | B | 2023-01-01T11:00:00 | 72.8 | prod | log | host-x | - | B | 2023-01-01T11:00:10 | 79.4 | prod | log | host-x | - | B | 2023-01-01T11:00:20 | 76.1 | prod | log | host-x | - | B | 2023-01-01T11:00:30 | 83.7 | prod | log | host-x | - | B | 2023-01-01T11:12:30 | 77.2 | prod | log | host-x | +--------+---------------------+-------+------+---------+--------+ "); @@ -193,22 +173,16 @@ mod tests { ORDER BY f_dkey, time_bin "#; - // ————————————————————————————————————————————————————————————— // Execute the query using distributed datafusion, 2 workers, // and hive-style partitioned data. 
- // ————————————————————————————————————————————————————————————— - - let (distributed_ctx, _guard) = start_localhost_context(2, DefaultSessionBuilder).await; - set_configs(&distributed_ctx); + let (mut distributed_ctx, _guard) = start_localhost_context(2, DefaultSessionBuilder).await; + set_configs(&mut distributed_ctx); register_tables(&distributed_ctx).await?; let (distributed_plan, distributed_results) = execute_query(&distributed_ctx, query).await?; - // ————————————————————————————————————————————————————————————— // Ensure the distributed plan matches our target plan, registering // hive-style partitioning and avoiding data-shuffling repartitions. - // ————————————————————————————————————————————————————————————— - assert_snapshot!(&distributed_plan, @r#" ┌───── DistributedExec ── Tasks: t0:[p0] │ SortPreservingMergeExec: [f_dkey@0 ASC NULLS LAST, time_bin@1 ASC NULLS LAST] @@ -223,14 +197,11 @@ mod tests { │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/dim/d_dkey=A/data0.parquet], [/testdata/join/parquet/dim/d_dkey=B/data0.parquet], [/testdata/join/parquet/dim/d_dkey=C/data0.parquet], [/testdata/join/parquet/dim/d_dkey=D/data0.parquet]]}, projection=[env, service, d_dkey], file_type=parquet, predicate=service@1 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)] │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] - │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/fact/f_dkey=A/data0.parquet], [/testdata/join/parquet/fact/f_dkey=B/data0.parquet, /testdata/join/parquet/fact/f_dkey=B/data1.parquet, /testdata/join/parquet/fact/f_dkey=B/data2.parquet], [/testdata/join/parquet/fact/f_dkey=C/data0.parquet, /testdata/join/parquet/fact/f_dkey=C/data1.parquet], [/testdata/join/parquet/fact/f_dkey=D/data0.parquet]]}, projection=[timestamp, value, f_dkey], output_ordering=[f_dkey@2 ASC NULLS LAST, timestamp@0 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ] + │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/fact/f_dkey=A/data0.parquet], [/testdata/join/parquet/fact/f_dkey=B/data0.parquet], [/testdata/join/parquet/fact/f_dkey=C/data0.parquet], [/testdata/join/parquet/fact/f_dkey=D/data0.parquet]]}, projection=[timestamp, value, f_dkey], output_ordering=[f_dkey@2 ASC NULLS LAST, timestamp@0 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ] └────────────────────────────────────────────────── "#); - // ————————————————————————————————————————————————————————————— // Ensure distributed results are correct. 
- // ————————————————————————————————————————————————————————————— - let pretty_results = pretty_format_batches(&distributed_results)?; assert_snapshot!(pretty_results, @" +--------+---------------------+------+---------------+ @@ -242,12 +213,6 @@ mod tests { | B | 2023-01-01T09:00:00 | prod | 82.4 | | B | 2023-01-01T09:00:30 | prod | 85.6 | | B | 2023-01-01T09:12:30 | prod | 120.0 | - | B | 2023-01-01T10:00:00 | prod | 91.2 | - | B | 2023-01-01T10:00:30 | prod | 94.1 | - | B | 2023-01-01T10:12:30 | prod | 95.8 | - | B | 2023-01-01T11:00:00 | prod | 79.4 | - | B | 2023-01-01T11:00:30 | prod | 83.7 | - | B | 2023-01-01T11:12:30 | prod | 77.2 | +--------+---------------------+------+---------------+ "); @@ -283,22 +248,16 @@ mod tests { ORDER BY env, time_bin "#; - // ————————————————————————————————————————————————————————————— // Execute the query using distributed datafusion, 2 workers, // and hive-style partitioned data. - // ————————————————————————————————————————————————————————————— - - let (distributed_ctx, _guard) = start_localhost_context(2, DefaultSessionBuilder).await; - set_configs(&distributed_ctx); + let (mut distributed_ctx, _guard) = start_localhost_context(2, DefaultSessionBuilder).await; + set_configs(&mut distributed_ctx); register_tables(&distributed_ctx).await?; let (distributed_plan, distributed_results) = execute_query(&distributed_ctx, query).await?; - // ————————————————————————————————————————————————————————————— // Ensure the distributed plan matches our target plan, registering // hive-style partitioning and avoiding data-shuffling repartitions. - // ————————————————————————————————————————————————————————————— - assert_snapshot!(&distributed_plan, @r#" ┌───── DistributedExec ── Tasks: t0:[p0] │ SortPreservingMergeExec: [env@0 ASC NULLS LAST, time_bin@1 ASC NULLS LAST] @@ -308,25 +267,21 @@ mod tests { │ [Stage 1] => NetworkShuffleExec: output_partitions=4, input_tasks=2 └────────────────────────────────────────────────── ┌───── Stage 1 ── Tasks: t0:[p0..p3] t1:[p0..p3] - │ CoalesceBatchesExec: target_batch_size=8192 - │ RepartitionExec: partitioning=Hash([env@0, time_bin@1], 4), input_partitions=2 - │ AggregateExec: mode=Partial, gby=[env@1 as env, time_bin@0 as time_bin], aggr=[avg(a.max_bin_value)] - │ ProjectionExec: expr=[date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp)@1 as time_bin, env@2 as env, max(j.value)@3 as max_bin_value] - │ AggregateExec: mode=SinglePartitioned, gby=[f_dkey@0 as f_dkey, date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }, timestamp@2) as date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp), env@1 as env], aggr=[max(j.value)], ordering_mode=PartiallySorted([0, 1]) - │ ProjectionExec: expr=[f_dkey@3 as f_dkey, env@0 as env, timestamp@1 as timestamp, value@2 as value] - │ HashJoinExec: mode=Partitioned, join_type=Inner, on=[(d_dkey@1, f_dkey@2)], projection=[env@0, timestamp@2, value@3, f_dkey@4] - │ FilterExec: service@1 = log, projection=[env@0, d_dkey@2] - │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] - │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/dim/d_dkey=A/data0.parquet], [/testdata/join/parquet/dim/d_dkey=B/data0.parquet], [/testdata/join/parquet/dim/d_dkey=C/data0.parquet], [/testdata/join/parquet/dim/d_dkey=D/data0.parquet]]}, projection=[env, service, d_dkey], file_type=parquet, predicate=service@1 = log, 
pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)] + │ RepartitionExec: partitioning=Hash([env@0, time_bin@1], 4), input_partitions=2 + │ AggregateExec: mode=Partial, gby=[env@1 as env, time_bin@0 as time_bin], aggr=[avg(a.max_bin_value)] + │ ProjectionExec: expr=[date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp)@1 as time_bin, env@2 as env, max(j.value)@3 as max_bin_value] + │ AggregateExec: mode=SinglePartitioned, gby=[f_dkey@0 as f_dkey, date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }, timestamp@2) as date_bin(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 30000000000 }"),j.timestamp), env@1 as env], aggr=[max(j.value)], ordering_mode=PartiallySorted([0, 1]) + │ ProjectionExec: expr=[f_dkey@3 as f_dkey, env@0 as env, timestamp@1 as timestamp, value@2 as value] + │ HashJoinExec: mode=Partitioned, join_type=Inner, on=[(d_dkey@1, f_dkey@2)], projection=[env@0, timestamp@2, value@3, f_dkey@4] + │ FilterExec: service@1 = log, projection=[env@0, d_dkey@2] │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] - │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/fact/f_dkey=A/data0.parquet], [/testdata/join/parquet/fact/f_dkey=B/data0.parquet, /testdata/join/parquet/fact/f_dkey=B/data1.parquet, /testdata/join/parquet/fact/f_dkey=B/data2.parquet], [/testdata/join/parquet/fact/f_dkey=C/data0.parquet, /testdata/join/parquet/fact/f_dkey=C/data1.parquet], [/testdata/join/parquet/fact/f_dkey=D/data0.parquet]]}, projection=[timestamp, value, f_dkey], output_ordering=[f_dkey@2 ASC NULLS LAST, timestamp@0 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ] + │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/dim/d_dkey=A/data0.parquet], [/testdata/join/parquet/dim/d_dkey=B/data0.parquet], [/testdata/join/parquet/dim/d_dkey=C/data0.parquet], [/testdata/join/parquet/dim/d_dkey=D/data0.parquet]]}, projection=[env, service, d_dkey], file_type=parquet, predicate=service@1 = log, pruning_predicate=service_null_count@2 != row_count@3 AND service_min@0 <= log AND log <= service_max@1, required_guarantees=[service in (log)] + │ PartitionIsolatorExec: t0:[p0,p1,__,__] t1:[__,__,p0,p1] + │ DataSourceExec: file_groups={4 groups: [[/testdata/join/parquet/fact/f_dkey=A/data0.parquet], [/testdata/join/parquet/fact/f_dkey=B/data0.parquet], [/testdata/join/parquet/fact/f_dkey=C/data0.parquet], [/testdata/join/parquet/fact/f_dkey=D/data0.parquet]]}, projection=[timestamp, value, f_dkey], output_ordering=[f_dkey@2 ASC NULLS LAST, timestamp@0 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ] └────────────────────────────────────────────────── "#); - // ————————————————————————————————————————————————————————————— // Ensure distributed results are correct. 
- // ————————————————————————————————————————————————————————————— - let pretty_results = pretty_format_batches(&distributed_results)?; assert_snapshot!(pretty_results, @" +------+---------------------+---------------+ @@ -338,12 +293,6 @@ mod tests { | prod | 2023-01-01T09:00:00 | 82.4 | | prod | 2023-01-01T09:00:30 | 85.6 | | prod | 2023-01-01T09:12:30 | 120.0 | - | prod | 2023-01-01T10:00:00 | 91.2 | - | prod | 2023-01-01T10:00:30 | 94.1 | - | prod | 2023-01-01T10:12:30 | 95.8 | - | prod | 2023-01-01T11:00:00 | 79.4 | - | prod | 2023-01-01T11:00:30 | 83.7 | - | prod | 2023-01-01T11:12:30 | 77.2 | +------+---------------------+---------------+ ");
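A closing note on the assertion style adopted across this series: hand-maintained expected vectors checked with assert_batches_sorted_eq!, and the validate_plan helper, are replaced by insta inline snapshots of pretty-printed batches and plans. A minimal sketch of the pattern, assuming the insta crate and arrow's pretty_format_batches as used in tests/join.rs; the check helper and its one-column table are invented for illustration:

    use datafusion::arrow::record_batch::RecordBatch;
    use datafusion::arrow::util::pretty::pretty_format_batches;
    use insta::assert_snapshot;

    fn check(batches: &[RecordBatch]) -> datafusion::error::Result<()> {
        // Render the batches as the same ASCII table shown in the snapshots
        // above, then compare against the inline literal after `@`. insta
        // strips the leading newline and common indentation before matching.
        let pretty = pretty_format_batches(batches)?;
        assert_snapshot!(pretty, @"
        +-----+
        | key |
        +-----+
        | 0   |
        +-----+
        ");
        Ok(())
    }

When a plan or result legitimately changes, cargo insta review (from the cargo-insta tool) rewrites the inline literals in place, instead of the expected vectors being edited by hand.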