@@ -1390,7 +1390,7 @@ class KeyGroupedPartitioningSuite extends DistributionAndOrderingSuiteBase {
1390
1390
}
1391
1391
}
1392
1392
1393
- test(" SPARK-44647: test join key is subset of cluster key " +
1393
+ test(" SPARK-44647: SPJ: test join key is subset of cluster key " +
1394
1394
" with push values and partially-clustered" ) {
1395
1395
val table1 = " tab1e1"
1396
1396
val table2 = " table2"
@@ -1487,7 +1487,7 @@ class KeyGroupedPartitioningSuite extends DistributionAndOrderingSuiteBase {
1487
1487
}
1488
1488
}
1489
1489
1490
- test(" SPARK-47094: Support compatible buckets" ) {
1490
+ test(" SPARK-47094: SPJ: Support compatible buckets" ) {
1491
1491
val table1 = " tab1e1"
1492
1492
val table2 = " table2"
1493
1493
@@ -1580,11 +1580,11 @@ class KeyGroupedPartitioningSuite extends DistributionAndOrderingSuiteBase {
1580
1580
val shuffles = collectShuffles(df.queryExecution.executedPlan)
1581
1581
assert(shuffles.isEmpty, " SPJ should be triggered" )
1582
1582
1583
- val scans = collectScans(df.queryExecution.executedPlan).map(_.inputRDD.
1583
+ val partitions = collectScans(df.queryExecution.executedPlan).map(_.inputRDD.
1584
1584
partitions.length) // one entry per collected scan: the partition count of its input RDD
1585
1585
val expectedBuckets = Math .min(table1buckets1, table2buckets1) *
1586
1586
Math .min(table1buckets2, table2buckets2)
1587
- assert(scans == Seq (expectedBuckets, expectedBuckets))
1587
+ assert(partitions == Seq (expectedBuckets, expectedBuckets)) // both scans must report expectedBuckets partitions
1588
1588
1589
1589
checkAnswer(df, Seq (
1590
1590
Row (0 , 0 , " aa" , " aa" ),
@@ -1647,7 +1647,7 @@ class KeyGroupedPartitioningSuite extends DistributionAndOrderingSuiteBase {
1647
1647
}
1648
1648
}
1649
1649
1650
- test(" SPARK-47094: Support compatible buckets with common divisor" ) {
1650
+ test(" SPARK-47094: SPJ: Support compatible buckets with common divisor" ) {
1651
1651
val table1 = " tab1e1"
1652
1652
val table2 = " table2"
1653
1653
@@ -1744,9 +1744,9 @@ class KeyGroupedPartitioningSuite extends DistributionAndOrderingSuiteBase {
1744
1744
partitions.length)
1745
1745
1746
1746
def gcd (a : Int , b : Int ): Int = BigInt (a).gcd(BigInt (b)).toInt
1747
- val expectedBuckets = gcd(table1buckets1, table2buckets1) *
1747
+ val expectedPartitions = gcd(table1buckets1, table2buckets1) *
1748
1748
gcd(table1buckets2, table2buckets2)
1749
- assert(scans == Seq (expectedBuckets, expectedBuckets ))
1749
+ assert(scans == Seq (expectedPartitions, expectedPartitions ))
1750
1750
1751
1751
checkAnswer(df, Seq (
1752
1752
Row (0 , 0 , " aa" , " aa" ),
@@ -1809,6 +1809,56 @@ class KeyGroupedPartitioningSuite extends DistributionAndOrderingSuiteBase {
1809
1809
}
1810
1810
}
1811
1811
1812
+
1813
+ test(" SPARK-47094: SPJ: Does not trigger when incompatible number of buckets on both side" ) { // asserts a shuffle stays in the join plan for every bucket-count pair listed below
1814
+ val table1 = " tab1e1" // NOTE(review): literal is spelled with the digit 1 ("tab1e1"), as in the sibling tests — confirm intentional
1815
+ val table2 = " table2"
1816
+
1817
+ Seq (
1818
+ (2 , 3 ),
1819
+ (3 , 4 )
1820
+ ).foreach {
1821
+ case (table1buckets1, table2buckets1) => // (bucket count used for table1, bucket count used for table2)
1822
+ catalog.clearTables() // clear the catalog's tables before creating this pair's tables
1823
+
1824
+ val partition1 = Array (bucket(table1buckets1, " store_id" )) // each table is partitioned only by bucket(n, store_id); the two sides differ in n
1825
+ val partition2 = Array (bucket(table2buckets1, " store_id" ))
1826
+
1827
+ Seq ((table1, partition1), (table2, partition2)).foreach { case (tab, part) =>
1828
+ createTable(tab, columns2, part) // both tables are created with columns2 and receive the same five rows
1829
+ val insertStr = s " INSERT INTO testcat.ns. $tab VALUES " +
1830
+ " (0, 0, 'aa'), " +
1831
+ " (1, 0, 'ab'), " + // duplicate partition key
1832
+ " (2, 2, 'ac'), " +
1833
+ " (3, 3, 'ad'), " +
1834
+ " (4, 2, 'bc') "
1835
+
1836
+ sql(insertStr)
1837
+ }
1838
+
1839
+ Seq (true , false ).foreach { allowJoinKeysSubsetOfPartitionKeys => // run the same join with the subset-of-partition-keys flag on, then off
1840
+ withSQLConf(
1841
+ SQLConf .REQUIRE_ALL_CLUSTER_KEYS_FOR_CO_PARTITION .key -> " false" ,
1842
+ SQLConf .V2_BUCKETING_PUSH_PART_VALUES_ENABLED .key -> " true" ,
1843
+ SQLConf .V2_BUCKETING_PARTIALLY_CLUSTERED_DISTRIBUTION_ENABLED .key -> " false" ,
1844
+ SQLConf .V2_BUCKETING_ALLOW_JOIN_KEYS_SUBSET_OF_PARTITION_KEYS .key ->
1845
+ allowJoinKeysSubsetOfPartitionKeys.toString,
1846
+ SQLConf .V2_BUCKETING_ALLOW_COMPATIBLE_TRANSFORMS .key -> " true" ) { // compatible-transforms flag is on for every run of this test
1847
+ val df = sql(
1848
+ s """
1849
+ | ${selectWithMergeJoinHint(" t1" , " t2" )}
1850
+ |t1.store_id, t1.dept_id, t1.data, t2.data
1851
+ |FROM testcat.ns. $table1 t1 JOIN testcat.ns. $table2 t2
1852
+ |ON t1.store_id = t2.store_id AND t1.dept_id = t2.dept_id
1853
+ | """ .stripMargin)
1854
+
1855
+ val shuffles = collectShuffles(df.queryExecution.executedPlan)
1856
+ assert(shuffles.nonEmpty, " SPJ should not be triggered" ) // a shuffle in the executed plan is taken as evidence the join was not planned as SPJ
1857
+ }
1858
+ }
1859
+ }
1860
+ }
1861
+
1812
1862
test(" SPARK-47094: Support compatible buckets with less join keys than partition keys" ) {
1813
1863
val table1 = " tab1e1"
1814
1864
val table2 = " table2"
0 commit comments