apache · viirya · May 31, 2017 · May 31, 2017 · Jun 1, 2017 · Jun 2, 2017
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -1002,6 +1002,15 @@ private[spark] object Utils extends Logging {
     }
   }
 
+  /** Lists all files and directories found under directory `f`, recursively. */
+  def recursiveList(f: File): Array[File] = {
+    require(f.isDirectory)
+    // `listFiles` returns null when the directory cannot be read (I/O error, permissions);
+    // treat that as an empty listing instead of failing with a NullPointerException.
+    val current = Option(f.listFiles).getOrElse(Array.empty[File])
+    current ++ current.filter(_.isDirectory).flatMap(recursiveList)
+  }
+
   /**
    * Delete a file or directory and its contents recursively.
    * Don't follow directories if they are symlinks.
@@ -2659,6 +2668,13 @@ private[spark] object Utils extends Logging {
     redact(redactionPattern, kvs.toArray)
   }
 
+  /** Returns the arithmetic mean of `ts` as a Double, or 0 when `ts` is empty. */
+  def average[T](ts: Iterable[T])(implicit num: Numeric[T]): Double = {
+    val count = ts.size
+    // Guard the empty case explicitly: dividing by a zero count would otherwise yield NaN.
+    if (count == 0) 0.0
+    else num.toDouble(ts.sum) / count
+  }
 }
 
 private[util] object CallerContext extends Logging {

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala
@@ -117,7 +117,7 @@ class QueryExecution(val sparkSession: SparkSession, val logical: LogicalPlan) {
    * `SparkSQLDriver` for CLI applications.
    */
   def hiveResultString(): Seq[String] = executedPlan match {
-    case ExecutedCommandExec(desc: DescribeTableCommand, _) =>
+    case ExecutedCommandExec(desc: DescribeTableCommand) =>
       // If it is a describe command for a Hive table, we want to have the output format
       // be similar with Hive.
       desc.run(sparkSession).map {
@@ -128,7 +128,7 @@ class QueryExecution(val sparkSession: SparkSession, val logical: LogicalPlan) {
             .mkString("\t")
       }
     // SHOW TABLES in Hive only output table names, while ours output database, table name, isTemp.
-    case command @ ExecutedCommandExec(s: ShowTablesCommand, _) if !s.isExtended =>
+    case command @ ExecutedCommandExec(s: ShowTablesCommand) if !s.isExtended =>
       command.executeCollect().map(_.getString(1))
     case other =>
       val result: Seq[Seq[Any]] = other.executeCollectPublic().map(_.toSeq).toSeq

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -346,7 +346,9 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
   // Can we automate these 'pass through' operations?
   object BasicOperators extends Strategy {
     def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
-      case r: RunnableCommand => ExecutedCommandExec(r, r.children.map(planLater)) :: Nil
+      case f: FileWritingCommand => FileWritingCommandExec(f, f.children.map(planLater)) :: Nil
+
+      case r: RunnableCommand => ExecutedCommandExec(r) :: Nil
 
       case MemoryPlan(sink, output) =>
         val encoder = RowEncoder(sink.schema)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/FileWritingCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/FileWritingCommand.scala
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.command
+
+import org.apache.spark.SparkContext
+import org.apache.spark.sql.{Row, SparkSession}
+import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
+import org.apache.spark.sql.catalyst.plans.logical
+import org.apache.spark.sql.execution.{SparkPlan, SQLExecution}
+import org.apache.spark.sql.execution.datasources.ExecutedWriteSummary
+import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
+import org.apache.spark.util.Utils
+
+/**
+ * A logical command specialized for writing data out. `FileWritingCommand`s are
+ * wrapped in `FileWritingCommandExec` during execution.
+ */
+trait FileWritingCommand extends logical.Command {
+
+  // The caller of `FileWritingCommand` can replace the metrics location by providing this external
+  // metrics structure. `withExternalMetrics` mutates this command in place and returns it.
+  private var _externalMetrics: Option[Map[String, SQLMetric]] = None
+  private[sql] def withExternalMetrics(map: Map[String, SQLMetric]): this.type = {
+    _externalMetrics = Option(map) // a null map clears the override
+    this
+  }
+
+  /**
+   * Those metrics will be updated once the command finishes writing data out. Those metrics will
+   * be taken by `FileWritingCommandExec` as its metrics when showing in UI.
+   */
+  def metrics(sparkContext: SparkContext): Map[String, SQLMetric] = _externalMetrics.getOrElse {
+    Map(
+      // General metrics.
+      "avgTime" -> SQLMetrics.createMetric(sparkContext, "average writing time (ms)"),
+      "numFiles" -> SQLMetrics.createMetric(sparkContext, "number of written files"),
+      "numOutputBytes" -> SQLMetrics.createMetric(sparkContext, "bytes of written output"),
+      "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"),
+      "numParts" -> SQLMetrics.createMetric(sparkContext, "number of dynamic part")
+    )
+  }
+
+  /**
+   * Callback function that updates metrics collected from the writing operation. Every value is
+   * looked up by its metric key; keys that are absent from `metrics` are skipped.
+   */
+  private[sql] def postDriverMetrics(sparkContext: SparkContext, metrics: Map[String, SQLMetric])
+      (writeSummaries: Seq[ExecutedWriteSummary]): Unit = {
+    // The time for writing individual file can be zero if it's less than 1 ms. Zero values can
+    // lower actual time of writing when calculating average, so excluding them.
+    val writingTime =
+      Utils.average(writeSummaries.flatMap(_.writingTimePerFile.filter(_ > 0))).toLong
+
+    // Name every value explicitly: zipping values against sorted keys silently mis-assigns
+    // them as soon as a key is added or renamed, or when the map uses a different key set.
+    val updates = Seq[(String, Long)](
+      "avgTime" -> writingTime,
+      "numFiles" -> writeSummaries.map(_.numOutputFile.toLong).sum,
+      "numOutputBytes" -> writeSummaries.map(_.numOutputBytes.toLong).sum,
+      "numOutputRows" -> writeSummaries.map(_.numOutputRows.toLong).sum,
+      "numParts" -> writeSummaries.map(_.updatedPartitions.size.toLong).sum)
+
+    // Update (and later post) only the metrics that are actually present in `metrics`.
+    val updated = updates.flatMap { case (name, value) =>
+      metrics.get(name).map { metric => metric.add(value); metric }
+    }
+
+    val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
+    SQLMetrics.postDriverMetricUpdates(sparkContext, executionId, updated)
+  }
+
+  /** Runs the command; `metricsCallback` takes the summaries of the executed writes. */
+  def run(
+    sparkSession: SparkSession,
+    children: Seq[SparkPlan],
+    metrics: Map[String, SQLMetric],
+    metricsCallback: (Seq[ExecutedWriteSummary]) => Unit): Seq[Row]
+}
+
+/**
+ * Physical node that runs a `FileWritingCommand` over its planned children. The result is kept
+ * in a lazy field so it is computed at most once; the command's metrics serve as this node's.
+ */
+case class FileWritingCommandExec(cmd: FileWritingCommand, children: Seq[SparkPlan])
+  extends CommandExec {
+
+  override lazy val metrics: Map[String, SQLMetric] = cmd.metrics(sqlContext.sparkContext)
+
+  protected[sql] lazy val invokeCommand: Seq[Row] = {
+    val onWriteSummaries = cmd.postDriverMetrics(sqlContext.sparkContext, metrics) _
+    cmd.run(sqlContext.sparkSession, children, metrics, onWriteSummaries)
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala
@@ -35,20 +35,16 @@ import org.apache.spark.sql.types._
  * wrapped in `ExecutedCommand` during execution.
  */
 trait RunnableCommand extends logical.Command {
-  def run(sparkSession: SparkSession, children: Seq[SparkPlan]): Seq[Row] = {
-    throw new NotImplementedError
-  }
-
-  def run(sparkSession: SparkSession): Seq[Row] = {
-    throw new NotImplementedError
-  }
+  def run(sparkSession: SparkSession): Seq[Row]
 }
 
 /**
- * A physical operator that executes the run method of a `RunnableCommand` and
+ * A physical operator that executes the run method of a `logical.Command` and
  * saves the result to prevent multiple executions.
  */
-case class ExecutedCommandExec(cmd: RunnableCommand, children: Seq[SparkPlan]) extends SparkPlan {
+trait CommandExec extends SparkPlan {
+  val cmd: logical.Command
+
   /**
    * A concrete command should override this lazy field to wrap up any side effects caused by the
    * command or any other computation that should be evaluated exactly once. The value of this field
@@ -60,14 +56,11 @@ case class ExecutedCommandExec(cmd: RunnableCommand, children: Seq[SparkPlan]) e
    */
   protected[sql] lazy val sideEffectResult: Seq[InternalRow] = {
     val converter = CatalystTypeConverters.createToCatalystConverter(schema)
-    val rows = if (children.isEmpty) {
-      cmd.run(sqlContext.sparkSession)
-    } else {
-      cmd.run(sqlContext.sparkSession, children)
-    }
-    rows.map(converter(_).asInstanceOf[InternalRow])
+    invokeCommand.map(converter(_).asInstanceOf[InternalRow])
   }
 
+  protected[sql] val invokeCommand: Seq[Row]
+
   override def innerChildren: Seq[QueryPlan[_]] = cmd.innerChildren
 
   override def output: Seq[Attribute] = cmd.output
@@ -85,6 +78,15 @@ case class ExecutedCommandExec(cmd: RunnableCommand, children: Seq[SparkPlan]) e
   }
 }
 
+/**
+ * Leaf physical node wrapping a `RunnableCommand`. The result of the command's `run` is held
+ * in a lazy field, so repeated accesses do not execute the command again.
+ */
+case class ExecutedCommandExec(cmd: RunnableCommand) extends CommandExec {
+  override def children: Seq[SparkPlan] = Seq.empty
+  protected[sql] lazy val invokeCommand: Seq[Row] = cmd.run(sqlContext.sparkSession)
+}
+
 /**
  * An explain command for users to see how a command will be executed.
  *

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
@@ -21,10 +21,13 @@ import java.net.URI
 
 import org.apache.hadoop.fs.Path
 
+import org.apache.spark.SparkContext
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.execution.datasources._
+import org.apache.spark.sql.execution.metric.SQLMetric
 import org.apache.spark.sql.sources.BaseRelation
 
 /**
@@ -120,11 +123,31 @@ case class CreateDataSourceTableAsSelectCommand(
     table: CatalogTable,
     mode: SaveMode,
     query: LogicalPlan)
-  extends RunnableCommand {
+  extends FileWritingCommand {
+
+  /**
+   * The code path this command writes data out depends on the type of data source:
+   *
+   * For FileFormat-based data sources, `InsertIntoHadoopFsRelationCommand` is invoked and we
+   * can update metrics.
+   * For other data sources, `CreatableRelationProvider.createRelation` will be called. We can't
+   * record metrics for that, so the metrics map is empty (likewise when no provider is set).
+   */
+  override def metrics(sparkContext: SparkContext): Map[String, SQLMetric] = {
+    // `exists` instead of `provider.get`: a missing provider must not throw from here.
+    val isFileFormat = table.provider.exists { provider =>
+      classOf[FileFormat].isAssignableFrom(DataSource.lookupDataSource(provider))
+    }
+    if (isFileFormat) super.metrics(sparkContext) else Map.empty
+  }
 
   override def innerChildren: Seq[LogicalPlan] = Seq(query)
 
-  override def run(sparkSession: SparkSession): Seq[Row] = {
+  override def run(
+      sparkSession: SparkSession,
+      children: Seq[SparkPlan],
+      metrics: Map[String, SQLMetric],
+      metricsCallback: (Seq[ExecutedWriteSummary]) => Unit): Seq[Row] = {
     assert(table.tableType != CatalogTableType.VIEW)
     assert(table.provider.isDefined)
 
@@ -146,7 +169,8 @@ case class CreateDataSourceTableAsSelectCommand(
       }
 
       saveDataIntoTable(
-        sparkSession, table, table.storage.locationUri, query, SaveMode.Append, tableExists = true)
+        sparkSession, table, table.storage.locationUri, query, SaveMode.Append, tableExists = true,
+        metrics = metrics)
     } else {
       assert(table.schema.isEmpty)
 
@@ -156,7 +180,8 @@ case class CreateDataSourceTableAsSelectCommand(
         table.storage.locationUri
       }
       val result = saveDataIntoTable(
-        sparkSession, table, tableLocation, query, SaveMode.Overwrite, tableExists = false)
+        sparkSession, table, tableLocation, query, SaveMode.Overwrite, tableExists = false,
+        metrics = metrics)
       val newTable = table.copy(
         storage = table.storage.copy(locationUri = tableLocation),
         // We will use the schema of resolved.relation as the schema of the table (instead of
@@ -183,7 +208,8 @@ case class CreateDataSourceTableAsSelectCommand(
       tableLocation: Option[URI],
       data: LogicalPlan,
       mode: SaveMode,
-      tableExists: Boolean): BaseRelation = {
+      tableExists: Boolean,
+      metrics: Map[String, SQLMetric]): BaseRelation = {
     // Create the relation based on the input logical plan: `data`.
     val pathOption = tableLocation.map("path" -> CatalogUtils.URIToString(_))
     val dataSource = DataSource(
@@ -195,7 +221,7 @@ case class CreateDataSourceTableAsSelectCommand(
       catalogTable = if (tableExists) Some(table) else None)
 
     try {
-      dataSource.writeAndRead(mode, query)
+      dataSource.writeAndRead(mode, query, Some(metrics))
     } catch {
       case ex: AnalysisException =>
         logError(s"Failed to write to table ${table.identifier.unquotedString}", ex)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
@@ -32,10 +32,12 @@ import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
 import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, CatalogTable, CatalogUtils}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
+import org.apache.spark.sql.execution.command.FileWritingCommandExec
 import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat
 import org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider
 import org.apache.spark.sql.execution.datasources.json.JsonFileFormat
 import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
+import org.apache.spark.sql.execution.metric.SQLMetric
 import org.apache.spark.sql.execution.streaming._
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.streaming.OutputMode
@@ -436,7 +438,10 @@ case class DataSource(
    * Writes the given [[LogicalPlan]] out to this [[DataSource]] and returns a [[BaseRelation]] for
    * the following reading.
    */
-  def writeAndRead(mode: SaveMode, data: LogicalPlan): BaseRelation = {
+  def writeAndRead(
+      mode: SaveMode,
+      data: LogicalPlan,
+      externalMetrics: Option[Map[String, SQLMetric]] = None): BaseRelation = {
     if (data.schema.map(_.dataType).exists(_.isInstanceOf[CalendarIntervalType])) {
       throw new AnalysisException("Cannot save interval data type into external storage.")
     }
@@ -446,7 +451,12 @@ case class DataSource(
         dataSource.createRelation(
           sparkSession.sqlContext, mode, caseInsensitiveOptions, Dataset.ofRows(sparkSession, data))
       case format: FileFormat =>
-        sparkSession.sessionState.executePlan(planForWritingFileFormat(format, mode, data)).toRdd
+        val qe = sparkSession.sessionState.executePlan(planForWritingFileFormat(format, mode, data))
+        qe.executedPlan.transform {
+          case f: FileWritingCommandExec =>
+            val newCmd = f.cmd.withExternalMetrics(externalMetrics.getOrElse(null))
+            FileWritingCommandExec(newCmd, f.children)
+        }.execute()
         // Replace the schema with that of the DataFrame we just wrote out to avoid re-inferring
         copy(userSpecifiedSchema = Some(data.schema.asNullable)).resolveRelation()
       case _ =>