apache · Sephiroth-Lin · Jul 15, 2015 · Jul 15, 2015 · Jul 16, 2015 · Jul 16, 2015
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -213,10 +213,44 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
   object CartesianProduct extends Strategy {
     def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
       case logical.Join(left, right, _, None) =>
-        execution.joins.CartesianProduct(planLater(left), planLater(right)) :: Nil
+        planCartesian(left, right) :: Nil
       case logical.Join(left, right, Inner, Some(condition)) =>
-        execution.Filter(condition,
-          execution.joins.CartesianProduct(planLater(left), planLater(right))) :: Nil
+        // The condition is applied as a filter on top of the full cartesian product.
+        execution.Filter(condition, planCartesian(left, right)) :: Nil
       case _ => Nil
     }
+
+    /**
+     * Plans the cartesian product of `left` and `right`, choosing the physical operator and
+     * its build side from the estimated `statistics.sizeInBytes` of the two inputs.
+     *
+     * The input with the smaller estimate is the "small" side; on a tie the right input is
+     * used. When `autoBroadcastJoinThreshold` is positive and the small side does not exceed
+     * it, a `BroadcastCartesianProduct` that broadcasts the small side is planned. Otherwise
+     * a `CartesianProduct` is planned with the small side as the left RDD of the cartesian.
+     *
+     * @param left logical plan of the left join input
+     * @param right logical plan of the right join input
+     * @return the physical operator for the unfiltered cartesian product
+     */
+    private def planCartesian(left: LogicalPlan, right: LogicalPlan): SparkPlan = {
+      val threshold = sqlContext.conf.autoBroadcastJoinThreshold
+      val leftSize = left.statistics.sizeInBytes
+      val rightSize = right.statistics.sizeInBytes
+      val rightIsSmaller = rightSize <= leftSize
+      val smallSize = if (rightIsSmaller) rightSize else leftSize
+
+      if (threshold > 0 && smallSize <= threshold) {
+        // BroadcastCartesianProduct broadcasts the side named by the build side.
+        val buildSide: joins.BuildSide =
+          if (rightIsSmaller) joins.BuildRight else joins.BuildLeft
+        execution.joins.BroadcastCartesianProduct(planLater(left), planLater(right), buildSide)
+      } else {
+        // CartesianProduct treats the side opposite to the build side as the small input,
+        // so the flag is deliberately the mirror image of the broadcast case.
+        val buildSide: joins.BuildSide =
+          if (rightIsSmaller) joins.BuildLeft else joins.BuildRight
+        execution.joins.CartesianProduct(planLater(left), planLater(right), buildSide)
+      }
+    }
   }

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastCartesianProduct.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastCartesianProduct.scala
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.joins
+
+import scala.concurrent._
+import scala.concurrent.duration._
+
+import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow}
+import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
+import org.apache.spark.util.ThreadUtils
+
+/**
+ * :: DeveloperApi ::
+ * Cartesian product that collects the rows of one child (the build side), broadcasts them,
+ * and pairs every row of the other (streamed) child with every broadcast row.
+ *
+ * `buildSide` names the child that is broadcast. Output rows always place the left child's
+ * columns before the right child's, whichever side is broadcast.
+ */
+@DeveloperApi
+case class BroadcastCartesianProduct(
+    left: SparkPlan,
+    right: SparkPlan,
+    buildSide: BuildSide) extends BinaryNode {
+  override def output: Seq[Attribute] = left.output ++ right.output
+
+  private val (streamed, broadcast) = buildSide match {
+    case BuildRight => (left, right)
+    case BuildLeft => (right, left)
+  }
+
+  // Upper bound on how long doExecute() waits for the broadcast; a negative
+  // `broadcastTimeout` setting means wait indefinitely.
+  private val timeout: Duration = {
+    val timeoutValue = sqlContext.conf.broadcastTimeout
+    if (timeoutValue < 0) {
+      Duration.Inf
+    } else {
+      timeoutValue.seconds
+    }
+  }
+
+  // Eager `val`: collecting and broadcasting the build side starts as soon as this operator
+  // is constructed, on the pool owned by the companion object. `Future.apply` is used
+  // instead of the `future` helper, which is deprecated since Scala 2.11.
+  @transient
+  private val broadcastFuture = Future {
+    val input = broadcast.execute().map(_.copy()).collect()
+    sparkContext.broadcast(input)
+  }(BroadcastCartesianProduct.broadcastCartesianProductExecutionContext)
+
+  /**
+   * Waits (up to `timeout`) for the broadcast of the build side and joins every streamed row
+   * with every broadcast row.
+   *
+   * @return an RDD of joined rows with the left child's columns first
+   */
+  protected override def doExecute(): RDD[InternalRow] = {
+    val streamedRows = streamed.execute().map(_.copy())
+    val broadcastRows = Await.result(broadcastFuture, timeout)
+    // Copy the field into a local so the task closure below does not capture `this`.
+    val side = buildSide
+
+    streamedRows.mapPartitions { streamedIter =>
+      val buildRows = broadcastRows.value
+      // Iterating the broadcast array lazily avoids building one array of joined rows per
+      // streamed row. Each pair still gets its own JoinedRow, so emitted rows never alias.
+      for (x <- streamedIter; y <- buildRows.iterator) yield {
+        val joinedRow = new JoinedRow
+        side match {
+          case BuildRight => joinedRow(x, y)
+          case BuildLeft => joinedRow(y, x)
+        }
+      }
+    }
+  }
+}
+
+object BroadcastCartesianProduct {
+  // Execution context on which every instance runs its collect-and-broadcast future.
+  // NOTE(review): 128 is presumably the pool's maximum thread count - confirm in ThreadUtils.
+  private val broadcastCartesianProductExecutionContext = ExecutionContext.fromExecutorService(
+    ThreadUtils.newDaemonCachedThreadPool("broadcast-cartesian-product", 128))
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/CartesianProduct.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/CartesianProduct.scala
@@ -27,16 +27,37 @@ import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
  * :: DeveloperApi ::
+ * Cartesian product of `left` and `right` computed with `RDD.cartesian`.
+ *
+ * NOTE: `buildSide` here is the mirror image of its meaning in `BroadcastCartesianProduct`:
+ * `BuildRight` uses `left` as the small input and `BuildLeft` uses `right`. The small input
+ * becomes the left-hand RDD of the cartesian; output rows always keep `left`'s columns first.
  */
 @DeveloperApi
-case class CartesianProduct(left: SparkPlan, right: SparkPlan) extends BinaryNode {
+case class CartesianProduct(
+    left: SparkPlan,
+    right: SparkPlan,
+    buildSide: BuildSide) extends BinaryNode {
   override def output: Seq[Attribute] = left.output ++ right.output
 
+  // `small` feeds the left-hand RDD of the cartesian, `big` the right-hand one.
+  private val (small, big) = buildSide match {
+    case BuildRight => (left, right)
+    case BuildLeft => (right, left)
+  }
+
   protected override def doExecute(): RDD[InternalRow] = {
-    val leftResults = left.execute().map(_.copy())
-    val rightResults = right.execute().map(_.copy())
+    // Despite the names, `leftResults` holds the `small` side and `rightResults` the `big` one.
+    val leftResults = small.execute().map(_.copy())
+    val rightResults = big.execute().map(_.copy())
+    // Copy the field into a local so the task closure below does not capture `this`.
+    val side = buildSide
 
     leftResults.cartesian(rightResults).mapPartitions { iter =>
       val joinedRow = new JoinedRow
-      iter.map(r => joinedRow(r._1, r._2))
+      // Pairs arrive as (small row, big row); restore the (left, right) column order.
+      side match {
+        case BuildRight => iter.map(r => joinedRow(r._1, r._2))
+        case BuildLeft => iter.map(r => joinedRow(r._2, r._1))
+      }
     }
   }
 }