
Commit 7795355

Add ShowCreateTableAsSparkCommand.

1 parent: ed280c2

6 files changed: +398, -75 lines

sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4

Lines changed: 4 additions & 1 deletion

@@ -183,7 +183,7 @@ statement
     | SHOW PARTITIONS tableIdentifier partitionSpec? #showPartitions
     | SHOW identifier? FUNCTIONS
         (LIKE? (qualifiedName | pattern=STRING))? #showFunctions
-    | SHOW CREATE TABLE tableIdentifier #showCreateTable
+    | SHOW CREATE TABLE tableIdentifier (AS SPARK)? #showCreateTable
     | (DESC | DESCRIBE) FUNCTION EXTENDED? describeFuncName #describeFunction
     | (DESC | DESCRIBE) database EXTENDED? identifier #describeDatabase
     | (DESC | DESCRIBE) TABLE? option=(EXTENDED | FORMATTED)?

@@ -1027,6 +1027,7 @@ ansiNonReserved
     | SKEWED
     | SORT
     | SORTED
+    | SPARK
     | START
     | STATISTICS
     | STORED

@@ -1283,6 +1284,7 @@ nonReserved
     | SOME
     | SORT
     | SORTED
+    | SPARK
     | START
     | STATISTICS
     | STORED

@@ -1541,6 +1543,7 @@ SKEWED: 'SKEWED';
 SOME: 'SOME';
 SORT: 'SORT';
 SORTED: 'SORTED';
+SPARK: 'SPARK';
 START: 'START';
 STATISTICS: 'STATISTICS';
 STORED: 'STORED';
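With this grammar change, SHOW CREATE TABLE accepts an optional AS SPARK suffix, and SPARK is registered as a non-reserved keyword so it remains usable as an ordinary identifier. A minimal usage sketch, assuming a Hive-enabled spark-shell (where `spark` is the provided session) and an existing Hive table; the table name `hive_tbl` is illustrative, not part of this commit:

    // New optional suffix matched by the #showCreateTable rule:
    spark.sql("SHOW CREATE TABLE hive_tbl AS SPARK").show(truncate = false)

    // The original form still parses exactly as before:
    spark.sql("SHOW CREATE TABLE hive_tbl").show(truncate = false)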

sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala

Lines changed: 6 additions & 2 deletions

@@ -235,11 +235,15 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) {
   }

   /**
-   * Creates a [[ShowCreateTableCommand]]
+   * Creates a [[ShowCreateTableCommand]] or [[ShowCreateTableAsSparkCommand]]
    */
   override def visitShowCreateTable(ctx: ShowCreateTableContext): LogicalPlan = withOrigin(ctx) {
     val table = visitTableIdentifier(ctx.tableIdentifier())
-    ShowCreateTableCommand(table)
+    if (ctx.SPARK != null) {
+      ShowCreateTableAsSparkCommand(table)
+    } else {
+      ShowCreateTableCommand(table)
+    }
   }

   /**
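Dispatch happens on the parse tree: when the optional AS SPARK tokens matched, the ANTLR-generated `ctx.SPARK` accessor is non-null and the builder emits the new command. A parser-level check, sketched after the style of Spark's DDL parser suites (the `SparkSqlParser` constructor shown here is an assumption and may differ by branch):

    import org.apache.spark.sql.catalyst.TableIdentifier
    import org.apache.spark.sql.execution.SparkSqlParser
    import org.apache.spark.sql.execution.command.{ShowCreateTableAsSparkCommand, ShowCreateTableCommand}
    import org.apache.spark.sql.internal.SQLConf

    object ParserDispatchCheck extends App {
      val parser = new SparkSqlParser(new SQLConf)

      // AS SPARK present -> the new command.
      assert(parser.parsePlan("SHOW CREATE TABLE t AS SPARK") ==
        ShowCreateTableAsSparkCommand(TableIdentifier("t")))

      // No suffix -> the original command, unchanged.
      assert(parser.parsePlan("SHOW CREATE TABLE t") ==
        ShowCreateTableCommand(TableIdentifier("t")))
    }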

sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala

Lines changed: 153 additions & 71 deletions

@@ -17,9 +17,7 @@

 package org.apache.spark.sql.execution.command

-import java.io.File
 import java.net.{URI, URISyntaxException}
-import java.nio.file.FileSystems

 import scala.collection.mutable.ArrayBuffer
 import scala.util.Try

@@ -29,7 +27,7 @@ import org.apache.hadoop.fs.{FileContext, FsConstants, Path}

 import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
 import org.apache.spark.sql.catalyst.TableIdentifier
-import org.apache.spark.sql.catalyst.analysis.{NoSuchPartitionException, UnresolvedAttribute, UnresolvedRelation}
+import org.apache.spark.sql.catalyst.analysis.{NoSuchPartitionException, UnresolvedAttribute}
 import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.catalog.CatalogTableType._
 import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec

@@ -44,7 +42,7 @@ import org.apache.spark.sql.execution.datasources.v2.csv.CSVDataSourceV2
 import org.apache.spark.sql.execution.datasources.v2.json.JsonDataSourceV2
 import org.apache.spark.sql.execution.datasources.v2.orc.OrcDataSourceV2
 import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetDataSourceV2
-import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.internal.{HiveSerDe, SQLConf}
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.util.SchemaUtils

@@ -942,7 +940,95 @@ case class ShowPartitionsCommand(
   }
 }

-case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableCommand {
+/**
+ * Provides common utilities between `ShowCreateTableCommand` and `ShowCreateTableAsSparkCommand`.
+ */
+trait ShowCreateTableCommandBase {
+
+  protected val table: TableIdentifier
+
+  protected def showTableLocation(metadata: CatalogTable, builder: StringBuilder): Unit = {
+    if (metadata.tableType == EXTERNAL) {
+      metadata.storage.locationUri.foreach { location =>
+        builder ++= s"LOCATION '${escapeSingleQuotedString(CatalogUtils.URIToString(location))}'\n"
+      }
+    }
+  }
+
+  protected def showTableComment(metadata: CatalogTable, builder: StringBuilder): Unit = {
+    metadata
+      .comment
+      .map("COMMENT '" + escapeSingleQuotedString(_) + "'\n")
+      .foreach(builder.append)
+  }
+
+  protected def showTableProperties(metadata: CatalogTable, builder: StringBuilder): Unit = {
+    if (metadata.properties.nonEmpty) {
+      val props = metadata.properties.map { case (key, value) =>
+        s"'${escapeSingleQuotedString(key)}' = '${escapeSingleQuotedString(value)}'"
+      }
+
+      builder ++= props.mkString("TBLPROPERTIES (\n  ", ",\n  ", "\n)\n")
+    }
+  }
+
+  protected def showDataSourceTableDataColumns(
+      metadata: CatalogTable, builder: StringBuilder): Unit = {
+    val columns = metadata.schema.fields.map(_.toDDL)
+    builder ++= columns.mkString("(", ", ", ")\n")
+  }
+
+  protected def showDataSourceTableOptions(metadata: CatalogTable, builder: StringBuilder): Unit = {
+    builder ++= s"USING ${metadata.provider.get}\n"
+
+    val dataSourceOptions = metadata.storage.properties.map {
+      case (key, value) => s"${quoteIdentifier(key)} '${escapeSingleQuotedString(value)}'"
+    }
+
+    if (dataSourceOptions.nonEmpty) {
+      builder ++= "OPTIONS (\n"
+      builder ++= dataSourceOptions.mkString("  ", ",\n  ", "\n")
+      builder ++= ")\n"
+    }
+  }
+
+  protected def showDataSourceTableNonDataColumns(
+      metadata: CatalogTable, builder: StringBuilder): Unit = {
+    val partCols = metadata.partitionColumnNames
+    if (partCols.nonEmpty) {
+      builder ++= s"PARTITIONED BY ${partCols.mkString("(", ", ", ")")}\n"
+    }
+
+    metadata.bucketSpec.foreach { spec =>
+      if (spec.bucketColumnNames.nonEmpty) {
+        builder ++= s"CLUSTERED BY ${spec.bucketColumnNames.mkString("(", ", ", ")")}\n"
+
+        if (spec.sortColumnNames.nonEmpty) {
+          builder ++= s"SORTED BY ${spec.sortColumnNames.mkString("(", ", ", ")")}\n"
+        }
+
+        builder ++= s"INTO ${spec.numBuckets} BUCKETS\n"
+      }
+    }
+  }
+
+  protected def showCreateDataSourceTable(metadata: CatalogTable): String = {
+    val builder = StringBuilder.newBuilder
+
+    builder ++= s"CREATE TABLE ${table.quotedString} "
+    showDataSourceTableDataColumns(metadata, builder)
+    showDataSourceTableOptions(metadata, builder)
+    showDataSourceTableNonDataColumns(metadata, builder)
+    showTableComment(metadata, builder)
+    showTableLocation(metadata, builder)
+    showTableProperties(metadata, builder)
+
+    builder.toString()
+  }
+}
+
+case class ShowCreateTableCommand(table: TableIdentifier)
+  extends RunnableCommand with ShowCreateTableCommandBase {
   override val output: Seq[Attribute] = Seq(
     AttributeReference("createtab_stmt", StringType, nullable = false)()
   )

@@ -1057,83 +1143,79 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableCommand {
       }
     }
   }
+}

-  private def showTableLocation(metadata: CatalogTable, builder: StringBuilder): Unit = {
-    if (metadata.tableType == EXTERNAL) {
-      metadata.storage.locationUri.foreach { location =>
-        builder ++= s"LOCATION '${escapeSingleQuotedString(CatalogUtils.URIToString(location))}'\n"
-      }
-    }
-  }
+/**
+ * This command generates Spark DDL for a Hive table.
+ *
+ * The syntax of using this command in SQL is:
+ * {{{
+ *   SHOW CREATE TABLE table_identifier AS SPARK;
+ * }}}
+ */
+case class ShowCreateTableAsSparkCommand(table: TableIdentifier)
+  extends RunnableCommand with ShowCreateTableCommandBase {
+  override val output: Seq[Attribute] = Seq(
+    AttributeReference("sparktab_stmt", StringType, nullable = false)()
+  )

-  private def showTableComment(metadata: CatalogTable, builder: StringBuilder): Unit = {
-    metadata
-      .comment
-      .map("COMMENT '" + escapeSingleQuotedString(_) + "'\n")
-      .foreach(builder.append)
-  }
+  override def run(sparkSession: SparkSession): Seq[Row] = {
+    val catalog = sparkSession.sessionState.catalog
+    val tableMetadata = catalog.getTableMetadata(table)

-  private def showTableProperties(metadata: CatalogTable, builder: StringBuilder): Unit = {
-    if (metadata.properties.nonEmpty) {
-      val props = metadata.properties.map { case (key, value) =>
-        s"'${escapeSingleQuotedString(key)}' = '${escapeSingleQuotedString(value)}'"
+    val stmt = if (DDLUtils.isDatasourceTable(tableMetadata)) {
+      throw new AnalysisException(
+        s"$table is already a Spark data source table. Use `SHOW CREATE TABLE` instead.")
+    } else {
+      if (tableMetadata.unsupportedFeatures.nonEmpty) {
+        throw new AnalysisException(
+          "Failed to execute SHOW CREATE TABLE AS SPARK against table " +
+            s"${tableMetadata.identifier}, which is created by Hive and uses the " +
+            "following unsupported feature(s)\n" +
+            tableMetadata.unsupportedFeatures.map(" - " + _).mkString("\n")
+        )
       }

-      builder ++= props.mkString("TBLPROPERTIES (\n  ", ",\n  ", "\n)\n")
-    }
-  }
-
-  private def showCreateDataSourceTable(metadata: CatalogTable): String = {
-    val builder = StringBuilder.newBuilder
-
-    builder ++= s"CREATE TABLE ${table.quotedString} "
-    showDataSourceTableDataColumns(metadata, builder)
-    showDataSourceTableOptions(metadata, builder)
-    showDataSourceTableNonDataColumns(metadata, builder)
-    showTableComment(metadata, builder)
-    showTableLocation(metadata, builder)
-    showTableProperties(metadata, builder)
-
-    builder.toString()
-  }
-
-  private def showDataSourceTableDataColumns(
-      metadata: CatalogTable, builder: StringBuilder): Unit = {
-    val columns = metadata.schema.fields.map(_.toDDL)
-    builder ++= columns.mkString("(", ", ", ")\n")
-  }
-
-  private def showDataSourceTableOptions(metadata: CatalogTable, builder: StringBuilder): Unit = {
-    builder ++= s"USING ${metadata.provider.get}\n"
+      if (tableMetadata.tableType == VIEW) {
+        throw new AnalysisException("Hive view isn't supported by SHOW CREATE TABLE AS SPARK")
+      }

-    val dataSourceOptions = metadata.storage.properties.map {
-      case (key, value) => s"${quoteIdentifier(key)} '${escapeSingleQuotedString(value)}'"
+      showCreateDataSourceTable(convertTableMetadata(tableMetadata))
     }

-    if (dataSourceOptions.nonEmpty) {
-      builder ++= "OPTIONS (\n"
-      builder ++= dataSourceOptions.mkString("  ", ",\n  ", "\n")
-      builder ++= ")\n"
-    }
+    Seq(Row(stmt))
   }

-  private def showDataSourceTableNonDataColumns(
-      metadata: CatalogTable, builder: StringBuilder): Unit = {
-    val partCols = metadata.partitionColumnNames
-    if (partCols.nonEmpty) {
-      builder ++= s"PARTITIONED BY ${partCols.mkString("(", ", ", ")")}\n"
-    }
-
-    metadata.bucketSpec.foreach { spec =>
-      if (spec.bucketColumnNames.nonEmpty) {
-        builder ++= s"CLUSTERED BY ${spec.bucketColumnNames.mkString("(", ", ", ")")}\n"
-
-        if (spec.sortColumnNames.nonEmpty) {
-          builder ++= s"SORTED BY ${spec.sortColumnNames.mkString("(", ", ", ")")}\n"
-        }
-
-        builder ++= s"INTO ${spec.numBuckets} BUCKETS\n"
+  private def convertTableMetadata(tableMetadata: CatalogTable): CatalogTable = {
+    val hiveSerde = HiveSerDe(
+      serde = tableMetadata.storage.serde,
+      inputFormat = tableMetadata.storage.inputFormat,
+      outputFormat = tableMetadata.storage.outputFormat)
+
+    // Look for the Spark data source that maps to the Hive serde.
+    // TODO: some Hive fileformat + row serde might be mapped to Spark data source, e.g. CSV.
+    val source = HiveSerDe.serdeToSource(hiveSerde)
+    if (source.isEmpty) {
+      val builder = StringBuilder.newBuilder
+      hiveSerde.serde.foreach { serde =>
+        builder ++= s" SERDE: $serde"
+      }
+      hiveSerde.inputFormat.foreach { format =>
+        builder ++= s" INPUTFORMAT: $format"
       }
+      hiveSerde.outputFormat.foreach { format =>
+        builder ++= s" OUTPUTFORMAT: $format"
+      }
+      throw new AnalysisException(
+        "Failed to execute SHOW CREATE TABLE AS SPARK against table " +
+          s"${tableMetadata.identifier}, which is created by Hive and uses the " +
+          "following unsupported serde configuration\n" +
+          builder.toString()
+      )
+    } else {
+      // TODO: should we keep Hive serde properties?
+      val newStorage = tableMetadata.storage.copy(properties = Map.empty)
+      tableMetadata.copy(provider = source, storage = newStorage)
     }
   }
 }
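Taken together, the new command rejects tables that are already data source tables, Hive views, and Hive tables with unsupported features or serdes; everything else is converted to data source form and rendered through the shared builder. An end-to-end sketch for a Hive-enabled spark-shell (the printed DDL shape is inferred from showCreateDataSourceTable above, not captured from an actual run, and the table name is illustrative):

    // A Hive table whose serde maps cleanly to a Spark data source.
    spark.sql("CREATE TABLE hive_parquet_tbl (a INT, b STRING) STORED AS PARQUET")

    // Expected shape, per the builder logic:
    //   CREATE TABLE `default`.`hive_parquet_tbl` (`a` INT, `b` STRING)
    //   USING parquet
    println(spark.sql("SHOW CREATE TABLE hive_parquet_tbl AS SPARK").head().getString(0))

    // Against a table that is already a data source table, the command throws
    // an AnalysisException, per the DDLUtils.isDatasourceTable check.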

sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala

Lines changed: 16 additions & 0 deletions

@@ -65,6 +65,14 @@ object HiveSerDe {
       outputFormat = Option("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat"),
       serde = Option("org.apache.hadoop.hive.serde2.avro.AvroSerDe")))

+  // The `HiveSerDe` values kept from `serdeMap` should be distinct.
+  val serdeInverseMap: Map[HiveSerDe, String] = serdeMap.flatMap {
+    case ("sequencefile", _) => None
+    case ("rcfile", _) => None
+    case ("textfile", serde) => Some((serde, "text"))
+    case pair => Some(pair.swap)
+  }
+
   /**
    * Get the Hive SerDe information from the data source abbreviation string or classname.
    *

@@ -88,6 +96,14 @@
     serdeMap.get(key)
   }

+  /**
+   * Get the Spark data source name from the Hive SerDe information.
+   *
+   * @param serde Hive SerDe information.
+   * @return Spark data source name associated with the specified Hive SerDe.
+   */
+  def serdeToSource(serde: HiveSerDe): Option[String] = serdeInverseMap.get(serde)
+
   def getDefaultStorage(conf: SQLConf): CatalogStorageFormat = {
     // To respect hive-site.xml, it peeks Hadoop configuration from existing Spark session,
     // as an easy workaround. See SPARK-27555.
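The inverse map deliberately drops sequencefile and rcfile, which have no native Spark data source counterpart, and maps textfile back to the "text" source name. A round-trip sketch, assuming the standard entries in serdeMap:

    import org.apache.spark.sql.internal.HiveSerDe

    // Abbreviation -> HiveSerDe -> data source name.
    val parquetSerde = HiveSerDe.sourceToSerDe("parquet").get
    assert(HiveSerDe.serdeToSource(parquetSerde).contains("parquet"))

    // "textfile" intentionally inverts to "text".
    val textSerde = HiveSerDe.sourceToSerDe("textfile").get
    assert(HiveSerDe.serdeToSource(textSerde).contains("text"))

    // Excluded entries have no inverse mapping.
    val rcSerde = HiveSerDe.sourceToSerDe("rcfile").get
    assert(HiveSerDe.serdeToSource(rcSerde).isEmpty)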

sql/core/src/test/scala/org/apache/spark/sql/ShowCreateTableSuite.scala

Lines changed: 1 addition & 1 deletion

@@ -200,7 +200,7 @@ abstract class ShowCreateTableSuite extends QueryTest with SQLTestUtils {
     }
   }

-  private def checkCatalogTables(expected: CatalogTable, actual: CatalogTable): Unit = {
+  protected def checkCatalogTables(expected: CatalogTable, actual: CatalogTable): Unit = {
     def normalize(table: CatalogTable): CatalogTable = {
       val nondeterministicProps = Set(
         "CreateTime",
