
Commit a39be8c

[SPARK-3857] Create a join package for various join operators.
1 parent bc44187 commit a39be8c
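
The change is mechanical: the physical join operators move out of the top level of org.apache.spark.sql.execution into a dedicated join subpackage, and call sites in the planner are updated. Schematically (a sketch; the class names are taken from the hunks below):

  // before this commit
  //   org.apache.spark.sql.execution.BroadcastHashJoin
  //   org.apache.spark.sql.execution.CartesianProduct
  // after this commit
  //   org.apache.spark.sql.execution.join.BroadcastHashJoin
  //   org.apache.spark.sql.execution.join.CartesianProduct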

File tree

15 files changed: +843 -647 lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala

Lines changed: 22 additions & 20 deletions
@@ -27,20 +27,19 @@ import org.apache.spark.sql.catalyst.types._
 import org.apache.spark.sql.columnar.{InMemoryRelation, InMemoryColumnarTableScan}
 import org.apache.spark.sql.parquet._
 
+
 private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
   self: SQLContext#SparkPlanner =>
 
   object LeftSemiJoin extends Strategy with PredicateHelper {
     def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
       // Find left semi joins where at least some predicates can be evaluated by matching join keys
       case ExtractEquiJoinKeys(LeftSemi, leftKeys, rightKeys, condition, left, right) =>
-        val semiJoin = execution.LeftSemiJoinHash(
-          leftKeys, rightKeys, planLater(left), planLater(right))
+        val semiJoin = join.LeftSemiJoinHash(leftKeys, rightKeys, planLater(left), planLater(right))
         condition.map(Filter(_, semiJoin)).getOrElse(semiJoin) :: Nil
       // no predicate can be evaluated by matching hash keys
       case logical.Join(left, right, LeftSemi, condition) =>
-        execution.LeftSemiJoinBNL(
-          planLater(left), planLater(right), condition) :: Nil
+        join.LeftSemiJoinBNL(planLater(left), planLater(right), condition) :: Nil
       case _ => Nil
     }
   }

@@ -50,13 +49,13 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
    * evaluated by matching hash keys.
    *
    * This strategy applies a simple optimization based on the estimates of the physical sizes of
-   * the two join sides. When planning a [[execution.BroadcastHashJoin]], if one side has an
+   * the two join sides. When planning a [[join.BroadcastHashJoin]], if one side has an
    * estimated physical size smaller than the user-settable threshold
    * [[org.apache.spark.sql.SQLConf.AUTO_BROADCASTJOIN_THRESHOLD]], the planner would mark it as the
    * ''build'' relation and mark the other relation as the ''stream'' side. The build table will be
    * ''broadcasted'' to all of the executors involved in the join, as a
    * [[org.apache.spark.broadcast.Broadcast]] object. If both estimates exceed the threshold, they
-   * will instead be used to decide the build side in a [[execution.ShuffledHashJoin]].
+   * will instead be used to decide the build side in a [[join.ShuffledHashJoin]].
    */
   object HashJoin extends Strategy with PredicateHelper {
 
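
The scaladoc above is the heart of the strategy: broadcast when one side's estimated size is at or below the threshold, otherwise shuffle and build on the smaller side. A hedged sketch of how a user would exercise it (it assumes a SQLContext named sqlContext with tables big and small registered, and that the SET key behind AUTO_BROADCASTJOIN_THRESHOLD is spark.sql.autoBroadcastJoinThreshold; both are illustrative, not taken from this diff):

  // Raise the broadcast threshold to 10 MB; a value <= 0 disables broadcasting.
  sqlContext.sql("SET spark.sql.autoBroadcastJoinThreshold=10485760")
  val joined = sqlContext.sql("SELECT * FROM big b JOIN small s ON b.key = s.key")
  // If small's estimated size is at or below the threshold, the physical plan
  // should contain join.BroadcastHashJoin with small as the build side;
  // otherwise the smaller side becomes the build side of join.ShuffledHashJoin.
  println(joined.queryExecution.executedPlan)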

@@ -66,8 +65,8 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
         left: LogicalPlan,
         right: LogicalPlan,
         condition: Option[Expression],
-        side: BuildSide) = {
-      val broadcastHashJoin = execution.BroadcastHashJoin(
+        side: join.BuildSide) = {
+      val broadcastHashJoin = execution.join.BroadcastHashJoin(
         leftKeys, rightKeys, side, planLater(left), planLater(right))
       condition.map(Filter(_, broadcastHashJoin)).getOrElse(broadcastHashJoin) :: Nil
     }

@@ -76,27 +75,26 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
       case ExtractEquiJoinKeys(Inner, leftKeys, rightKeys, condition, left, right)
           if sqlContext.autoBroadcastJoinThreshold > 0 &&
              right.statistics.sizeInBytes <= sqlContext.autoBroadcastJoinThreshold =>
-        makeBroadcastHashJoin(leftKeys, rightKeys, left, right, condition, BuildRight)
+        makeBroadcastHashJoin(leftKeys, rightKeys, left, right, condition, join.BuildRight)
 
       case ExtractEquiJoinKeys(Inner, leftKeys, rightKeys, condition, left, right)
           if sqlContext.autoBroadcastJoinThreshold > 0 &&
             left.statistics.sizeInBytes <= sqlContext.autoBroadcastJoinThreshold =>
-        makeBroadcastHashJoin(leftKeys, rightKeys, left, right, condition, BuildLeft)
+        makeBroadcastHashJoin(leftKeys, rightKeys, left, right, condition, join.BuildLeft)
 
       case ExtractEquiJoinKeys(Inner, leftKeys, rightKeys, condition, left, right) =>
         val buildSide =
           if (right.statistics.sizeInBytes <= left.statistics.sizeInBytes) {
-            BuildRight
+            join.BuildRight
           } else {
-            BuildLeft
+            join.BuildLeft
           }
-        val hashJoin =
-          execution.ShuffledHashJoin(
-            leftKeys, rightKeys, buildSide, planLater(left), planLater(right))
+        val hashJoin = join.ShuffledHashJoin(
+          leftKeys, rightKeys, buildSide, planLater(left), planLater(right))
         condition.map(Filter(_, hashJoin)).getOrElse(hashJoin) :: Nil
 
       case ExtractEquiJoinKeys(joinType, leftKeys, rightKeys, condition, left, right) =>
-        execution.HashOuterJoin(
+        join.HashOuterJoin(
           leftKeys, rightKeys, joinType, condition, planLater(left), planLater(right)) :: Nil
 
       case _ => Nil

@@ -164,8 +162,12 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
     def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
       case logical.Join(left, right, joinType, condition) =>
         val buildSide =
-          if (right.statistics.sizeInBytes <= left.statistics.sizeInBytes) BuildRight else BuildLeft
-        execution.BroadcastNestedLoopJoin(
+          if (right.statistics.sizeInBytes <= left.statistics.sizeInBytes) {
+            join.BuildRight
+          } else {
+            join.BuildLeft
+          }
+        join.BroadcastNestedLoopJoin(
           planLater(left), planLater(right), buildSide, joinType, condition) :: Nil
       case _ => Nil
     }

@@ -174,10 +176,10 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
   object CartesianProduct extends Strategy {
     def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
       case logical.Join(left, right, _, None) =>
-        execution.CartesianProduct(planLater(left), planLater(right)) :: Nil
+        execution.join.CartesianProduct(planLater(left), planLater(right)) :: Nil
       case logical.Join(left, right, Inner, Some(condition)) =>
         execution.Filter(condition,
-          execution.CartesianProduct(planLater(left), planLater(right))) :: Nil
+          execution.join.CartesianProduct(planLater(left), planLater(right))) :: Nil
       case _ => Nil
     }
   }

sql/core/src/main/scala/org/apache/spark/sql/execution/join/BroadcastHashJoin.scala

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.execution.join

import scala.concurrent._
import scala.concurrent.duration._
import scala.concurrent.ExecutionContext.Implicits.global

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnspecifiedDistribution}
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}

/**
 * :: DeveloperApi ::
 * Performs an inner hash join of two child relations. When the output RDD of this operator is
 * being constructed, a Spark job is asynchronously started to calculate the values for the
 * broadcasted relation. This data is then placed in a Spark broadcast variable. The streamed
 * relation is not shuffled.
 */
@DeveloperApi
case class BroadcastHashJoin(
    leftKeys: Seq[Expression],
    rightKeys: Seq[Expression],
    buildSide: BuildSide,
    left: SparkPlan,
    right: SparkPlan)
  extends BinaryNode with HashJoin {

  override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning

  override def requiredChildDistribution =
    UnspecifiedDistribution :: UnspecifiedDistribution :: Nil

  // Collect and broadcast the build side eagerly, as soon as this operator is
  // constructed, so the work overlaps with whatever happens before execute().
  @transient
  private val broadcastFuture = future {
    sparkContext.broadcast(buildPlan.executeCollect())
  }

  override def execute() = {
    // Block here (for up to 5 minutes) until the broadcast of the build side is done.
    val broadcastRelation = Await.result(broadcastFuture, 5.minute)

    streamedPlan.execute().mapPartitions { streamedIter =>
      joinIterators(broadcastRelation.value.iterator, streamedIter)
    }
  }
}
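
The interesting wrinkle in BroadcastHashJoin is that the build side is collected and broadcast on a future that starts when the operator is constructed, and execute() only blocks when the result is actually needed. A self-contained sketch of that pattern with plain Scala futures (no Spark; every name here is illustrative):

  import scala.concurrent.{Await, Future}
  import scala.concurrent.duration._
  import scala.concurrent.ExecutionContext.Implicits.global

  object AsyncBuildSideDemo extends App {
    // Stand-in for buildPlan.executeCollect(): an expensive materialization.
    def collectBuildSide(): Seq[(Int, String)] = {
      Thread.sleep(100) // pretend this is a Spark job
      Seq(1 -> "a", 2 -> "b")
    }

    // Kicked off eagerly, like broadcastFuture above.
    val buildFuture: Future[Map[Int, String]] = Future(collectBuildSide().toMap)

    // The "streamed" side proceeds with other work in the meantime...
    val streamed = Seq(1 -> "x", 2 -> "y", 3 -> "z")

    // ...and we only block once the join actually needs the build side.
    val hashed = Await.result(buildFuture, 5.minutes)
    val joined = streamed.flatMap { case (k, v) => hashed.get(k).map(b => (k, v, b)) }
    println(joined) // List((1,x,a), (2,y,b))
  }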

sql/core/src/main/scala/org/apache/spark/sql/execution/join/BroadcastNestedLoopJoin.scala

Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.execution.join

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.catalyst.plans.{FullOuter, JoinType, LeftOuter, RightOuter}
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
import org.apache.spark.util.collection.CompactBuffer

/**
 * :: DeveloperApi ::
 * Performs a nested-loop join of two child relations, broadcasting one side to every node.
 * The join condition, if any, is an arbitrary predicate, so this operator also covers joins
 * the hash-based operators cannot handle.
 */
@DeveloperApi
case class BroadcastNestedLoopJoin(
    left: SparkPlan,
    right: SparkPlan,
    buildSide: BuildSide,
    joinType: JoinType,
    condition: Option[Expression]) extends BinaryNode {
  // TODO: Override requiredChildDistribution.

  /** BuildRight means the right relation <=> the broadcast relation. */
  private val (streamed, broadcast) = buildSide match {
    case BuildRight => (left, right)
    case BuildLeft => (right, left)
  }

  override def outputPartitioning: Partitioning = streamed.outputPartitioning

  // Outer joins make the columns of the side(s) that may be null-padded nullable.
  override def output = {
    joinType match {
      case LeftOuter =>
        left.output ++ right.output.map(_.withNullability(true))
      case RightOuter =>
        left.output.map(_.withNullability(true)) ++ right.output
      case FullOuter =>
        left.output.map(_.withNullability(true)) ++ right.output.map(_.withNullability(true))
      case _ =>
        left.output ++ right.output
    }
  }

  @transient private lazy val boundCondition =
    InterpretedPredicate(
      condition
        .map(c => BindReferences.bindReference(c, left.output ++ right.output))
        .getOrElse(Literal(true)))

  override def execute() = {
    val broadcastedRelation =
      sparkContext.broadcast(broadcast.execute().map(_.copy()).collect().toIndexedSeq)

    /** All rows that either match both-way, or rows from streamed joined with nulls. */
    val matchesOrStreamedRowsWithNulls = streamed.execute().mapPartitions { streamedIter =>
      val matchedRows = new CompactBuffer[Row]
      // TODO: Use Spark's BitSet.
      val includedBroadcastTuples =
        new scala.collection.mutable.BitSet(broadcastedRelation.value.size)
      val joinedRow = new JoinedRow
      val leftNulls = new GenericMutableRow(left.output.size)
      val rightNulls = new GenericMutableRow(right.output.size)

      streamedIter.foreach { streamedRow =>
        var i = 0
        var streamRowMatched = false

        while (i < broadcastedRelation.value.size) {
          // TODO: One bitset per partition instead of per row.
          val broadcastedRow = broadcastedRelation.value(i)
          buildSide match {
            case BuildRight if boundCondition(joinedRow(streamedRow, broadcastedRow)) =>
              matchedRows += joinedRow(streamedRow, broadcastedRow).copy()
              streamRowMatched = true
              includedBroadcastTuples += i
            case BuildLeft if boundCondition(joinedRow(broadcastedRow, streamedRow)) =>
              matchedRows += joinedRow(broadcastedRow, streamedRow).copy()
              streamRowMatched = true
              includedBroadcastTuples += i
            case _ =>
          }
          i += 1
        }

        // Streamed rows that matched nothing are null-padded for the relevant outer joins.
        (streamRowMatched, joinType, buildSide) match {
          case (false, LeftOuter | FullOuter, BuildRight) =>
            matchedRows += joinedRow(streamedRow, rightNulls).copy()
          case (false, RightOuter | FullOuter, BuildLeft) =>
            matchedRows += joinedRow(leftNulls, streamedRow).copy()
          case _ =>
        }
      }
      Iterator((matchedRows, includedBroadcastTuples))
    }

    // Union the per-partition bitsets to learn which broadcast rows matched anywhere.
    val includedBroadcastTuples = matchesOrStreamedRowsWithNulls.map(_._2)
    val allIncludedBroadcastTuples =
      if (includedBroadcastTuples.count == 0) {
        new scala.collection.mutable.BitSet(broadcastedRelation.value.size)
      } else {
        includedBroadcastTuples.reduce(_ ++ _)
      }

    val leftNulls = new GenericMutableRow(left.output.size)
    val rightNulls = new GenericMutableRow(right.output.size)
    /** Rows from broadcasted joined with nulls. */
    val broadcastRowsWithNulls: Seq[Row] = {
      val buf: CompactBuffer[Row] = new CompactBuffer()
      var i = 0
      val rel = broadcastedRelation.value
      while (i < rel.length) {
        if (!allIncludedBroadcastTuples.contains(i)) {
          (joinType, buildSide) match {
            case (RightOuter | FullOuter, BuildRight) => buf += new JoinedRow(leftNulls, rel(i))
            case (LeftOuter | FullOuter, BuildLeft) => buf += new JoinedRow(rel(i), rightNulls)
            case _ =>
          }
        }
        i += 1
      }
      buf.toSeq
    }

    // TODO: Breaks lineage.
    sparkContext.union(
      matchesOrStreamedRowsWithNulls.flatMap(_._1), sparkContext.makeRDD(broadcastRowsWithNulls))
  }
}
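
The algorithm above runs in three phases: each streamed partition records its matches plus a bitset of broadcast-row indexes that matched; the bitsets are unioned across partitions; and broadcast rows whose bit never got set are emitted padded with nulls for the appropriate outer joins. A minimal sketch of the same idea on plain collections (illustrative names, no Spark):

  object NestedLoopFullOuterDemo extends App {
    val streamed  = Seq(1, 2, 3)          // streamed-side rows (keys only)
    val broadcast = IndexedSeq(2, 3, 4)   // "broadcast"-side rows
    val cond = (s: Int, b: Int) => s == b // an arbitrary predicate, as in condition

    val included = new scala.collection.mutable.BitSet(broadcast.size)
    // Phase 1: matches, plus streamed rows that matched nothing (null-padded).
    val streamSide: Seq[(Option[Int], Option[Int])] = streamed.flatMap { s =>
      val hits = broadcast.indices.filter(i => cond(s, broadcast(i)))
      hits.foreach(included += _)
      if (hits.nonEmpty) hits.map(i => (Some(s), Some(broadcast(i))))
      else Seq((Some(s), None)) // the LeftOuter/FullOuter padding
    }
    // Phases 2 and 3: broadcast rows that never matched any streamed row.
    val broadcastSide = broadcast.indices.collect {
      case i if !included(i) => (None, Some(broadcast(i)))
    }
    println(streamSide ++ broadcastSide)
    // List((Some(1),None), (Some(2),Some(2)), (Some(3),Some(3)), (None,Some(4)))
  }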

sql/core/src/main/scala/org/apache/spark/sql/execution/join/CartesianProduct.scala

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.execution.join

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.sql.catalyst.expressions.JoinedRow
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}

/**
 * :: DeveloperApi ::
 */
@DeveloperApi
case class CartesianProduct(left: SparkPlan, right: SparkPlan) extends BinaryNode {
  override def output = left.output ++ right.output

  override def execute() = {
    // Defensive copies: operators may reuse mutable rows, and RDD.cartesian
    // buffers rows from both sides, so each row must be materialized first.
    val leftResults = left.execute().map(_.copy())
    val rightResults = right.execute().map(_.copy())

    leftResults.cartesian(rightResults).mapPartitions { iter =>
      // JoinedRow presents the two input rows as one without copying their fields.
      val joinedRow = new JoinedRow
      iter.map(r => joinedRow(r._1, r._2))
    }
  }
}
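
CartesianProduct leans on JoinedRow to avoid copying: rather than materializing a new row containing all fields of both inputs, JoinedRow is a view that answers field lookups from whichever side owns the index. A sketch of that idea with ordinary collections (PairView is made up for illustration; it is not the Spark class):

  object JoinedRowIdeaDemo extends App {
    // A cheap "two rows as one" view: no fields are copied.
    final class PairView[A](left: IndexedSeq[A], right: IndexedSeq[A]) extends IndexedSeq[A] {
      def length: Int = left.length + right.length
      def apply(i: Int): A = if (i < left.length) left(i) else right(i - left.length)
    }

    val row = new PairView(Vector(1, 2), Vector(3))
    println(row.toList) // List(1, 2, 3): one logical row backed by both inputs
  }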
