
Commit 319d45b

DescribeTable based on CatalogTable
1 parent 30efbce commit 319d45b

File tree: 6 files changed (+176, -155 lines)

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala

Lines changed: 0 additions & 17 deletions
@@ -320,23 +320,6 @@ class SessionCatalog(
     alias.map(a => SubqueryAlias(a, qualifiedTable)).getOrElse(qualifiedTable)
   }
 
-  /**
-   * Describes a table by returning various metadata pertaining to table/partitions/columns.
-   */
-  def describeTable(
-      table: TableIdentifier,
-      partSpec: Option[TablePartitionSpec],
-      colPath: Option[String],
-      isExtended: Boolean,
-      output: Seq[Attribute]): Seq[(String, String, String)] = {
-    val relation = lookupRelation(table)
-    relation.schema.fields.map { field =>
-      val cmtKey = "comment"
-      val comment = if (field.metadata.contains(cmtKey)) field.metadata.getString(cmtKey) else ""
-      (field.name, field.dataType.simpleString, comment)
-    }
-  }
-
   /**
    * Return whether a table with the specified name exists.
    *
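
The schema-based fallback deleted here does not go away: it reappears as the generic `case relation =>` branch of DescribeTableCommand.run in tables.scala below. A minimal, self-contained sketch of that fallback, using a made-up two-column schema (the FallbackDemo object and its data are illustrative only):

import org.apache.spark.sql.types._

object FallbackDemo extends App {
  // One field carries a "comment" metadata entry, the other does not.
  val commented = new MetadataBuilder().putString("comment", "primary key").build()
  val schema = StructType(Seq(
    StructField("id", IntegerType, nullable = false, commented),
    StructField("name", StringType)))

  // Extract (name, type, comment) triples exactly as the fallback does.
  val rows = schema.fields.map { field =>
    val comment =
      if (field.metadata.contains("comment")) field.metadata.getString("comment") else ""
    (field.name, field.dataType.simpleString, comment)
  }
  rows.foreach(println) // (id,int,primary key) then (name,string,)
}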

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala

Lines changed: 8 additions & 0 deletions
@@ -110,6 +110,14 @@ case class CatalogTable(
   def partitionColumns: Seq[CatalogColumn] =
     schema.filter { c => partitionColumnNames.contains(c.name) }
 
+  /** Columns this table is bucketed by. */
+  def bucketColumns: Seq[CatalogColumn] =
+    schema.filter { c => bucketColumnNames.contains(c.name) }
+
+  /** Columns this table is sorted by. */
+  def sortColumns: Seq[CatalogColumn] =
+    schema.filter { c => sortColumnNames.contains(c.name) }
+
   /** Return the database this table was specified to belong to, assuming it exists. */
   def database: String = identifier.database.getOrElse {
     throw new AnalysisException(s"table $identifier did not specify database")
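
The two new helpers follow the same name-based filter as the existing partitionColumns. A quick standalone sketch of the pattern, with simplified stand-ins for CatalogColumn and CatalogTable (the Column/Table classes and sample data below are made up; Spark's real classes carry more fields):

case class Column(name: String, dataType: String)

case class Table(
    schema: Seq[Column],
    bucketColumnNames: Seq[String],
    sortColumnNames: Seq[String]) {
  // Same filter pattern the commit adds to CatalogTable.
  def bucketColumns: Seq[Column] = schema.filter(c => bucketColumnNames.contains(c.name))
  def sortColumns: Seq[Column] = schema.filter(c => sortColumnNames.contains(c.name))
}

object BucketDemo extends App {
  val t = Table(
    schema = Seq(Column("id", "int"), Column("ts", "timestamp"), Column("v", "string")),
    bucketColumnNames = Seq("id"),
    sortColumnNames = Seq("ts"))
  println(t.bucketColumns.map(_.name)) // List(id)
  println(t.sortColumns.map(_.name))   // List(ts)
}

Because the helpers filter the schema rather than look names up, results keep schema order, and any name that does not match a schema column is silently dropped.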

sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala

Lines changed: 157 additions & 21 deletions
@@ -20,15 +20,17 @@ package org.apache.spark.sql.execution.command
 import java.io.File
 import java.net.URI
 
+import scala.collection.mutable.ArrayBuffer
+
 import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
 import org.apache.spark.sql.catalyst.TableIdentifier
-import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType, ExternalCatalog}
-import org.apache.spark.sql.catalyst.catalog.ExternalCatalog.TablePartitionSpec
-import org.apache.spark.sql.catalyst.catalog.{CatalogRelation, CatalogTable, CatalogTableType}
+import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
+import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
 import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan, UnaryNode}
-import org.apache.spark.sql.types.{BooleanType, MetadataBuilder, StringType}
+import org.apache.spark.sql.execution.datasources.PartitioningUtils
+import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
 
 case class CreateTableAsSelectLogicalPlan(
@@ -274,6 +276,7 @@ case class LoadData(
  * {{{
  *   DESCRIBE [EXTENDED|FORMATTED] [db_name.]table_name [column_name] [PARTITION partition_spec]
  * }}}
+ * Note: the FORMATTED option is not supported yet.
  * @param table table to be described.
  * @param partSpec If specified, the specified partition is described. It is effective only
  *                 when the table is a Hive table.
@@ -289,7 +292,7 @@ case class DescribeTableCommand(
     isExtended: Boolean)
   extends RunnableCommand {
 
-  override val output: Seq[Attribute] = Seq(
+  override val output: Seq[Attribute] = Seq(
     // Column names are based on Hive.
     AttributeReference("col_name", StringType, nullable = false,
       new MetadataBuilder().putString("comment", "name of the column").build())(),
@@ -299,28 +302,161 @@ case class DescribeTableCommand(
       new MetadataBuilder().putString("comment", "comment of the column").build())()
   )
 
-  override def run(sparkSession: SparkSession): Seq[Row] = {
-    val catalog = sparkSession.sessionState.catalog
-    // Check that the supplied partition columns are valid.
-    if (partSpec.isDefined && !catalog.isTemporaryTable(table)) {
-      val tab = catalog.getTableMetadata(table)
-      val badColumns = partSpec.get.keySet.filterNot(tab.partitionColumns.map(_.name).contains)
-      if (badColumns.nonEmpty) {
-        throw new AnalysisException(
-          s"Non-partitioned column(s) [${badColumns.mkString(", ")}] are " +
-            s"specified for DESCRIBE command")
+  private def formatColumns(cols: Seq[CatalogColumn]): String = {
+    cols.map { col =>
+      s"""
+         |${col.getClass.getSimpleName}
+         |(name:${col.name}
+         |type:${col.dataType}
+         |comment:${col.comment.orNull})
+       """.stripMargin
+    }.mkString(",")
+  }
+
+  private def formatProperties(props: Map[String, String]): String = {
+    props.map {
+      case (k, v) => s"$k=$v"
+    }.mkString("{", ", ", "}")
+  }
+
+  private def getPartValues(part: CatalogTablePartition, cols: Seq[String]): String = {
+    cols.map { name =>
+      PartitioningUtils.escapePathName(part.spec(name))
+    }.mkString(", ")
+  }
+
+  private def descColPath(table: CatalogTable, colPath: String): Array[Row] = {
+    val names = colPath.split("\\.")
+    val lastName = names(names.length - 1)
+    val fields = table.schema.map { c =>
+      StructField(c.name, CatalystSqlParser.parseDataType(c.dataType), c.nullable)
+    }
+    var dataType: DataType = StructType(fields)
+    for (i <- names.indices) {
+      dataType match {
+        case s: StructType =>
+          try {
+            dataType = s.apply(names(i)).dataType
+          } catch {
+            case _: Exception =>
+              throw new AnalysisException(s"Column name/path: $colPath does not exist.")
+          }
+        case m: MapType if names(i) == "$key$" => dataType = m.keyType
+        case m: MapType if names(i) == "$value$" => dataType = m.valueType
+        case a: ArrayType if names(i) == "$value$" => dataType = a.elementType
+        case _ => throw new AnalysisException(s"Column name/path: $colPath does not exist.")
       }
     }
 
-    val results =
-      sparkSession.sessionState.catalog.describeTable(table, partSpec, colPath, isExtended, output)
-    val rows = results.map { case (name, dataType, comment) =>
-      Row(name, dataType, comment)
+    val result: Seq[Row] = dataType match {
+      case s: StructType =>
+        s.map { f => Row(f.name, f.dataType.simpleString, "from deserializer") }
+      case _ => Seq(Row(lastName, dataType.simpleString, "from deserializer"))
     }
-    rows
+    result.toArray
+  }
+
+  private def descStorageFormat(
+      table: CatalogTable,
+      storage: CatalogStorageFormat): String = {
+    // TODO - check with Lian - compress, skewedInfo and storedAsSubDirectories from
+    // StorageDesc are not available, so they are dropped from the output.
+    val storageLocationStr =
+      s"""
+         |${storage.getClass.getSimpleName}(location:${storage.locationUri.orNull},
+         | inputFormat:${storage.inputFormat.orNull},
+         | outputFormat:${storage.outputFormat.orNull},
+         | numBuckets:${table.numBuckets},
+         | serializationLib=${storage.serde.orNull},
+         | parameters=${formatProperties(storage.serdeProperties)},
+         | bucketCols:[${formatColumns(table.bucketColumns)}],
+         | sortCols=[${formatColumns(table.sortColumns)}])
+       """.stripMargin.replaceAll("\n", "").trim
+    storageLocationStr
+  }
+
+  private def descPartExtended(table: CatalogTable, part: CatalogTablePartition): String = {
+    val result = StringBuilder.newBuilder
+    val clsName = part.getClass.getSimpleName
+    result ++= s"${clsName}(values:[${getPartValues(part, table.partitionColumnNames)}], "
+    result ++= s"dbName:${table.database}, "
+    // TODO - check with Lian - no owner info available.
+    result ++= s"createTime:${table.createTime}, "
+    result ++= s"lastAccessTime:${table.lastAccessTime}, "
+    // TODO - check with Lian - no retention info available.
+
+    result ++= s"sd:${descStorageFormat(table, part.storage)}, "
+    // TODO - check with Lian - Hive prints partition keys here. Since we already output
+    // the partitioning keys and schema at the start, we don't output them here again.
+    result ++= s"parameters:${formatProperties(table.properties)}, "
+    result ++= s"viewOriginalText:${table.viewOriginalText.orNull}, "
+    result ++= s"viewExpandedText:${table.viewText.orNull}, "
+    result ++= s"tableType:${table.tableType})"
+    result.toString
   }
-}
 
+  private def descTableExtended(table: CatalogTable): String = {
+    val result = StringBuilder.newBuilder
+    result ++= s"${table.getClass.getSimpleName}(tableName:${table.identifier.table}, "
+    result ++= s"dbName:${table.database}, "
+    // TODO - check with Lian - no owner info available.
+    result ++= s"createTime:${table.createTime}, "
+    result ++= s"lastAccessTime:${table.lastAccessTime}, "
+    // TODO - check with Lian - no retention info available.
+
+    result ++= s"sd:${descStorageFormat(table, table.storage)}, "
+    // TODO - check with Lian - Hive prints partition keys here. Since we already output
+    // the partitioning keys and schema at the start, we don't output them here again.
+    result ++= s"parameters:${formatProperties(table.properties)}, "
+    result ++= s"viewOriginalText:${table.viewOriginalText.orNull}, "
+    result ++= s"viewExpandedText:${table.viewText.orNull}, "
+    result ++= s"tableType:${table.tableType})"
+    result.toString
+  }
+
+  override def run(sparkSession: SparkSession): Seq[Row] = {
+    val result = new ArrayBuffer[Row]
+    val catalog = sparkSession.sessionState.catalog
+    catalog.lookupRelation(table) match {
+      case catalogRelation: CatalogRelation =>
+        val tab = catalogRelation.catalogTable
+        val part = partSpec.map(p => Option(catalog.getPartition(table, p))).getOrElse(None)
+        if (colPath.nonEmpty) {
+          result ++= descColPath(tab, colPath.get)
+        } else {
+          catalogRelation.catalogTable.schema.foreach { column =>
+            result += Row(column.name, column.dataType, column.comment.orNull)
+          }
+          if (tab.partitionColumns.nonEmpty) {
+            result += Row("# Partition Information", "", "")
+            result += Row(s"# ${output(0).name}", output(1).name, output(2).name)
+
+            tab.partitionColumns.foreach { col =>
+              result += Row(col.name, col.dataType, col.comment.orNull)
+            }
+          }
+          if (isExtended) {
+            if (partSpec.isEmpty) {
+              result += Row("Detailed Table Information", descTableExtended(tab), "")
+            } else {
+              result +=
+                Row("Detailed Partition Information", descPartExtended(tab, part.get), "")
+            }
+          }
+        }
+
+      case relation =>
+        relation.schema.fields.foreach { field =>
+          val comment =
+            if (field.metadata.contains("comment")) field.metadata.getString("comment") else ""
+          result += Row(field.name, field.dataType.simpleString, comment)
+        }
+    }
+
+    result
+  }
+}
 
 /**
  * A command for users to get tables in the given database.
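
The most intricate piece of the new command is descColPath's walk through nested column types. Below is a minimal, self-contained sketch of the same walk outside the command machinery; the schema, the dotted path, and the ColumnPathDemo/resolve names are made up for illustration, while "$key$" and "$value$" are the Hive-style tokens the code above accepts for map keys, map values, and array elements.

import org.apache.spark.sql.types._

object ColumnPathDemo extends App {
  // events: map<string, struct<ts: bigint, msg: string>>
  val schema = StructType(Seq(
    StructField("id", IntegerType),
    StructField("events", MapType(StringType, StructType(Seq(
      StructField("ts", LongType),
      StructField("msg", StringType)))))))

  // Fold the dotted path over the type tree, mirroring descColPath's loop.
  def resolve(root: DataType, path: String): DataType =
    path.split("\\.").foldLeft(root) { (current, name) =>
      (current, name) match {
        case (s: StructType, _) => s(name).dataType // throws if the field is missing
        case (m: MapType, "$key$") => m.keyType
        case (m: MapType, "$value$") => m.valueType
        case (a: ArrayType, "$value$") => a.elementType
        case _ => sys.error(s"Column name/path: $name does not exist.")
      }
    }

  println(resolve(schema, "events.$value$.ts").simpleString) // prints: bigint
}

At the SQL level the command would then serve statements like the following (the table, column, and partition names are hypothetical):

// Assuming `spark` is a SparkSession with a table `mydb.sales`:
// spark.sql("DESCRIBE mydb.sales").show()                                       // columns + partition block
// spark.sql("DESCRIBE EXTENDED mydb.sales").show()                              // adds Detailed Table Information
// spark.sql("DESCRIBE EXTENDED mydb.sales PARTITION (dt='2016-04-26')").show()  // adds Detailed Partition Information
// spark.sql("DESCRIBE mydb.sales events.$value$.ts").show()                     // column path into nested types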

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala

Lines changed: 1 addition & 22 deletions
@@ -29,8 +29,7 @@ import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier}
 import org.apache.spark.sql.catalyst.analysis.FunctionRegistry
 import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder
 import org.apache.spark.sql.catalyst.catalog.{FunctionResourceLoader, SessionCatalog}
-import org.apache.spark.sql.catalyst.catalog.ExternalCatalog._
-import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, ExpressionInfo}
+import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionInfo}
 import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias}
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.hive.HiveShim.HiveFunctionWrapper
@@ -67,26 +66,6 @@ private[sql] class HiveSessionCatalog(
     }
   }
 
-  /**
-   * Describes a table by returning various metadata pertaining to table/partitions/columns.
-   */
-  override def describeTable(
-      table: TableIdentifier,
-      partSpec: Option[TablePartitionSpec],
-      colPath: Option[String],
-      isExtended: Boolean,
-      output: Seq[Attribute]): Seq[(String, String, String)] = {
-    val relation = lookupRelation(table)
-    relation match {
-      case r: MetastoreRelation =>
-        val db = table.database.getOrElse(currentDb)
-        val tableName = formatTableName(table.table)
-        client.describeTable(db, tableName, partSpec, colPath, isExtended, output)
-      case o: LogicalPlan =>
-        super.describeTable(table, partSpec, colPath, isExtended, output)
-    }
-  }
-
   // ----------------------------------------------------------------
   // | Methods and fields for interacting with HiveMetastoreCatalog |
   // ----------------------------------------------------------------
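
The override above can be removed because dispatch now lives in DescribeTableCommand itself: it pattern matches on whatever lookupRelation returns, so Hive and non-Hive tables share one code path. A schematic sketch of that dispatch (it compiles against the catalyst module; the describe helper name is made up):

import org.apache.spark.sql.catalyst.catalog.CatalogRelation
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

// Catalog-backed relations expose the full CatalogTable metadata; any other
// plan falls back to its schema, as in DescribeTableCommand.run above.
def describe(plan: LogicalPlan): String = plan match {
  case r: CatalogRelation => s"catalog table: ${r.catalogTable.identifier}"
  case other => s"generic relation: ${other.schema.simpleString}"
}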

sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala

Lines changed: 0 additions & 84 deletions
@@ -478,90 +478,6 @@ private[hive] class HiveClientImpl(
     client.getTablesByPattern(dbName, pattern).asScala
   }
 
-  /**
-   * Describes a Hive table.
-   * The syntax of using this command in SQL is:
-   * {{{
-   *   DESCRIBE [EXTENDED|FORMATTED] [db_name.]table_name [column_name] [PARTITION partition_spec]
-   * }}}
-   * @param table The table to be described.
-   * @param partSpec If specified, the specified partition is described.
-   * @param colPath If specified, only the specified column is described.
-   * @param isExtended True if "DESCRIBE EXTENDED" is used. Otherwise, false.
-   */
-  override def describeTable(
-      db: String,
-      table: String,
-      partSpec: Option[TablePartitionSpec],
-      colPath: Option[String],
-      isExtended: Boolean,
-      output: Seq[Attribute]): Seq[(String, String, String)] = withHiveState {
-
-    // Get partition columns or table columns based on the supplied partition spec.
-    def getCols(tab: HiveTable, part: Option[HivePartition]): Seq[FieldSchema] = {
-      if (partSpec.nonEmpty && tab.getTableType() != HiveTableType.VIRTUAL_VIEW) {
-        part.get.getCols.asScala
-      } else {
-        tab.getCols.asScala
-      }
-    }
-    // Formats the column metadata as per the output schema.
-    def formatColumns(cols: Seq[FieldSchema]): Seq[(String, String, String)] = {
-      cols.map(field => (field.getName, field.getType, field.getComment))
-    }
-
-    var results: Seq[(String, String, String)] = Nil
-    // Get the table.
-    val tab =
-      Option(client.getTable(db, table, false)).
-        getOrElse(throw new NoSuchTableException(db, table))
-
-    // Get partition info.
-    val part = partSpec.map(p => Option(client.getPartition(tab, p.asJava, false))).getOrElse(None)
-    if (partSpec.nonEmpty && part.isEmpty) {
-      throw new AnalysisException(
-        s"partition to describe '${partSpec.get}' does not exist" +
-          s" in table '$table' database '$db'")
-    }
-
-    // Get columns if colPath is specified.
-    val cols = colPath.map { p =>
-      val qualifiedColPath = if (!p.startsWith(table)) s"${table}.${p}" else p
-      try {
-        Hive.getFieldsFromDeserializer(qualifiedColPath, tab.getDeserializer(true)).asScala
-      } catch {
-        case e: Exception => throw new AnalysisException(e.getMessage)
-      }
-    }.getOrElse(getCols(tab, part))
-
-    if (colPath.isEmpty) {
-      // Describe all the columns in the table first.
-      results ++= formatColumns(getCols(tab, part) ++ tab.getPartCols().asScala)
-
-      // Describe partition columns.
-      val partitionColumns = tab.getPartCols.asScala
-      if (partitionColumns.nonEmpty) {
-        results ++=
-          Seq(("# Partition Information", "", "")) ++
-            Seq((s"# ${output(0).name}", output(1).name, output(2).name)) ++
-            formatColumns(partitionColumns)
-      }
-
-      // Describe additional table/partition details.
-      if (isExtended) {
-        if (partSpec.isEmpty) {
-          results ++= Seq(("Detailed Table Information", tab.getTTable.toString, ""))
-        } else {
-          results ++= Seq(("Detailed Partition Information", part.get.getTPartition.toString, ""))
-        }
-      }
-    } else {
-      results ++= formatColumns(cols)
-    }
-    results
-  }
-
   /**
    * Runs the specified SQL query using Hive.
    */
