
Commit f47fecf

Merge branch 'master' into toStringNPE

2 parents: 76199c6 + a9ec033

File tree: 9 files changed, +332 -66 lines


sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala

Lines changed: 5 additions & 3 deletions
@@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.errors.TreeNodeException
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.QueryPlan
-import org.apache.spark.sql.catalyst.types.StructType
+import org.apache.spark.sql.catalyst.types.{StringType, StructType}
 import org.apache.spark.sql.catalyst.trees

 abstract class LogicalPlan extends QueryPlan[LogicalPlan] {
@@ -102,7 +102,7 @@ abstract class LeafNode extends LogicalPlan with trees.LeafNode[LogicalPlan] {
  */
 abstract class Command extends LeafNode {
   self: Product =>
-  def output = Seq.empty
+  def output: Seq[Attribute] = Seq.empty
 }

 /**
@@ -115,7 +115,9 @@ case class NativeCommand(cmd: String) extends Command
 * Returned by a parser when the users only wants to see what query plan would be executed, without
 * actually performing the execution.
 */
-case class ExplainCommand(plan: LogicalPlan) extends Command
+case class ExplainCommand(plan: LogicalPlan) extends Command {
+  override def output = Seq(AttributeReference("plan", StringType, nullable = false)())
+}

 /**
  * A logical plan node with single child.
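
With this override, every EXPLAIN result carries the same one-column schema: a single non-nullable string attribute named "plan". A small hedged sketch of that attribute in isolation, using only the types the diff already imports; the printed values follow from the constructor arguments:

    import org.apache.spark.sql.catalyst.expressions.AttributeReference
    import org.apache.spark.sql.catalyst.types.StringType

    // The schema an ExplainCommand now declares: one string column called "plan".
    // The empty second argument list picks up a fresh expression id by default.
    val planAttr = AttributeReference("plan", StringType, nullable = false)()
    println(planAttr.name)       // plan
    println(planAttr.dataType)   // StringType
    println(planAttr.nullable)   // false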

sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala

Lines changed: 6 additions & 5 deletions
@@ -191,6 +191,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
     val sparkContext = self.sparkContext

     val strategies: Seq[Strategy] =
+      CommandStrategy(self) ::
       TakeOrdered ::
       PartialAggregation ::
       LeftSemiJoin ::
@@ -256,6 +257,11 @@ class SQLContext(@transient val sparkContext: SparkContext)
       Batch("Prepare Expressions", Once, new BindReferences[SparkPlan]) :: Nil
   }

+  // TODO: or should we make QueryExecution protected[sql]?
+  protected[sql] def mkQueryExecution(plan: LogicalPlan) = new QueryExecution {
+    val logical = plan
+  }
+
   /**
    * The primary workflow for executing relational queries using Spark. Designed to allow easy
    * access to the intermediate phases of query execution for developers.
@@ -285,11 +291,6 @@ class SQLContext(@transient val sparkContext: SparkContext)
         |== Physical Plan ==
         |${stringOrError(executedPlan)}
       """.stripMargin.trim
-
-    /**
-     * Runs the query after interposing operators that print the result of each intermediate step.
-     */
-    def debugExec() = DebugQuery(executedPlan).execute().collect()
   }

   /**
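
The new mkQueryExecution helper lets other code in this commit plan an arbitrary logical plan without declaring an anonymous QueryExecution inline. A hedged sketch of the two call patterns it enables; the method is protected[sql], so this only compiles inside the org.apache.spark.sql packages, and `context` and `plan` are placeholder names for a SQLContext and a LogicalPlan:

    // Hypothetical call site inside the sql package.
    val qe = context.mkQueryExecution(plan)
    qe.executedPlan          // what CommandStrategy hands to ExplainCommandPhysical below
    qe.toString.split("\n")  // what HiveContext.stringResult now returns for EXPLAIN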

sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala

Lines changed: 11 additions & 0 deletions
@@ -233,4 +233,15 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
       case _ => Nil
     }
   }
+
+  // TODO: this should be merged with SPARK-1508's SetCommandStrategy
+  case class CommandStrategy(context: SQLContext) extends Strategy {
+    def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
+      case logical.ExplainCommand(child) =>
+        val qe = context.mkQueryExecution(child)
+        Seq(execution.ExplainCommandPhysical(qe.executedPlan, plan.output)(context))
+      case _ => Nil
+    }
+  }
+
 }
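
Like every Strategy, CommandStrategy is a partial planner: it produces a physical plan only for the nodes it recognizes and returns Nil for everything else, so the remaining entries in `strategies` get their turn. A hedged illustration with placeholder names (`ctx` for a SQLContext, `query` for an ordinary LogicalPlan):

    val commands = CommandStrategy(ctx)
    commands(logical.ExplainCommand(query))   // Seq(ExplainCommandPhysical(...))
    commands(query)                           // Nil: some other strategy must plan it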

sql/core/src/main/scala/org/apache/spark/sql/execution/debug.scala renamed to sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala

Lines changed: 10 additions & 23 deletions
@@ -17,29 +17,16 @@

 package org.apache.spark.sql.execution

-private[sql] object DebugQuery {
-  def apply(plan: SparkPlan): SparkPlan = {
-    val visited = new collection.mutable.HashSet[Long]()
-    plan transform {
-      case s: SparkPlan if !visited.contains(s.id) =>
-        visited += s.id
-        DebugNode(s)
-    }
-  }
-}
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.{SQLContext, Row}
+import org.apache.spark.sql.catalyst.expressions.{GenericRow, Attribute}

-private[sql] case class DebugNode(child: SparkPlan) extends UnaryNode {
-  def references = Set.empty
-  def output = child.output
-  def execute() = {
-    val childRdd = child.execute()
-    println(
-      s"""
-        |=========================
-        |${child.simpleString}
-        |=========================
-      """.stripMargin)
-    childRdd.foreach(println(_))
-    childRdd
+case class ExplainCommandPhysical(child: SparkPlan, output: Seq[Attribute])
+    (@transient context: SQLContext) extends UnaryNode {
+  def execute(): RDD[Row] = {
+    val planString = new GenericRow(Array[Any](child.toString))
+    context.sparkContext.parallelize(Seq(planString))
   }
+
+  override def otherCopyArgs = context :: Nil
 }
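
ExplainCommandPhysical never touches its child's data: it renders the already-planned child to a string on the driver and ships that back as a one-row RDD. A self-contained sketch of the same pattern using only plain Spark types; the plan text is a placeholder for child.toString:

    import org.apache.spark.{SparkConf, SparkContext}

    // Stand-alone sketch of the one-row "result set" pattern used by execute().
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("explain-sketch"))
    val planText = "Project [key#0]\n HiveTableScan ..."   // placeholder for child.toString
    val result = sc.parallelize(Seq(planText))              // a single-element RDD
    assert(result.collect().toSeq == Seq(planText))
    sc.stop()
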
Lines changed: 119 additions & 0 deletions (new file)

@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution
+
+import scala.collection.mutable.HashSet
+
+import org.apache.spark.{AccumulatorParam, Accumulator, SparkContext}
+import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.SparkContext._
+import org.apache.spark.sql.{SchemaRDD, Row}
+
+/**
+ * :: DeveloperApi ::
+ * Contains methods for debugging query execution.
+ *
+ * Usage:
+ * {{{
+ *   sql("SELECT key FROM src").debug
+ * }}}
+ */
+package object debug {
+
+  /**
+   * :: DeveloperApi ::
+   * Augments SchemaRDDs with debug methods.
+   */
+  @DeveloperApi
+  implicit class DebugQuery(query: SchemaRDD) {
+    def debug(implicit sc: SparkContext): Unit = {
+      val plan = query.queryExecution.executedPlan
+      val visited = new collection.mutable.HashSet[Long]()
+      val debugPlan = plan transform {
+        case s: SparkPlan if !visited.contains(s.id) =>
+          visited += s.id
+          DebugNode(sc, s)
+      }
+      println(s"Results returned: ${debugPlan.execute().count()}")
+      debugPlan.foreach {
+        case d: DebugNode => d.dumpStats()
+        case _ =>
+      }
+    }
+  }
+
+  private[sql] case class DebugNode(
+      @transient sparkContext: SparkContext,
+      child: SparkPlan) extends UnaryNode {
+    def references = Set.empty
+
+    def output = child.output
+
+    implicit object SetAccumulatorParam extends AccumulatorParam[HashSet[String]] {
+      def zero(initialValue: HashSet[String]): HashSet[String] = {
+        initialValue.clear()
+        initialValue
+      }
+
+      def addInPlace(v1: HashSet[String], v2: HashSet[String]): HashSet[String] = {
+        v1 ++= v2
+        v1
+      }
+    }
+
+    /**
+     * A collection of stats for each column of output.
+     * @param elementTypes the actual runtime types for the output.  Useful when there are bugs
+     *        causing the wrong data to be projected.
+     */
+    case class ColumnStat(
+      elementTypes: Accumulator[HashSet[String]] = sparkContext.accumulator(HashSet.empty))
+    val tupleCount = sparkContext.accumulator[Int](0)
+
+    val numColumns = child.output.size
+    val columnStats = Array.fill(child.output.size)(new ColumnStat())
+
+    def dumpStats(): Unit = {
+      println(s"== ${child.simpleString} ==")
+      println(s"Tuples output: ${tupleCount.value}")
+      child.output.zip(columnStats).foreach { case (attr, stat) =>
+        val actualDataTypes = stat.elementTypes.value.mkString("{", ",", "}")
+        println(s" ${attr.name} ${attr.dataType}: $actualDataTypes")
+      }
+    }
+
+    def execute() = {
+      child.execute().mapPartitions { iter =>
+        new Iterator[Row] {
+          def hasNext = iter.hasNext
+          def next() = {
+            val currentRow = iter.next()
+            tupleCount += 1
+            var i = 0
+            while (i < numColumns) {
+              val value = currentRow(i)
+              columnStats(i).elementTypes += HashSet(value.getClass.getName)
+              i += 1
+            }
+            currentRow
+          }
+        }
+      }
+    }
+  }
+}
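
As the scaladoc shows, the whole user-facing surface of the new package is the implicit `.debug` method. A hedged usage sketch; TestHive is simply one convenient way to get a SparkContext, an hql interface, and the `src` table, and the implicit SparkContext is required by debug()'s signature:

    import org.apache.spark.sql.execution.debug._
    import org.apache.spark.sql.hive.test.TestHive

    implicit val sc = TestHive.sparkContext     // debug() takes the SparkContext implicitly
    TestHive.hql("SELECT key FROM src").debug   // prints per-operator tuple counts and observed column types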

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala

Lines changed: 2 additions & 1 deletion
@@ -218,6 +218,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
     val hiveContext = self

     override val strategies: Seq[Strategy] = Seq(
+      CommandStrategy(self),
       TakeOrdered,
       ParquetOperations,
       HiveTableScans,
@@ -304,7 +305,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
     */
    def stringResult(): Seq[String] = analyzed match {
      case NativeCommand(cmd) => runSqlHive(cmd)
-     case ExplainCommand(plan) => new QueryExecution { val logical = plan }.toString.split("\n")
+     case ExplainCommand(plan) => mkQueryExecution(plan).toString.split("\n")
      case query =>
        val result: Seq[Seq[Any]] = toRdd.collect().toSeq
        // We need the types so we can output struct field names

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala

Lines changed: 12 additions & 0 deletions
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.hive.execution

 import org.apache.spark.sql.hive.test.TestHive._
+import org.apache.spark.sql.hive.test.TestHive

 /**
  * A set of test cases expressed in Hive QL that are not covered by the tests included in the hive distribution.
@@ -159,4 +160,15 @@ class HiveQuerySuite extends HiveComparisonTest {
     hql("SHOW TABLES").toString
     hql("SELECT * FROM src").toString
   }
+
+  test("SPARK-1704: Explain commands as a SchemaRDD") {
+    hql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
+    val rdd = hql("explain select key, count(value) from src group by key")
+    assert(rdd.collect().size == 1)
+    assert(rdd.toString.contains("ExplainCommand"))
+    assert(rdd.filter(row => row.toString.contains("ExplainCommand")).collect().size == 0,
+      "actual contents of the result should be the plans of the query to be explained")
+    TestHive.reset()
+  }
+
 }
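
The assertions above pin down the user-visible contract: the SchemaRDD's own plan (its toString) mentions ExplainCommand, while the collected data contains only the plans of the query being explained. A hedged interactive version of the same check, reusing the suite's TestHive imports:

    import org.apache.spark.sql.hive.test.TestHive._

    val explained = hql("explain select key, count(value) from src group by key")
    println(explained.toString)                             // mentions ExplainCommand
    explained.collect().foreach(row => println(row(0)))     // the explained query's own plans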
