-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-19265][SQL] make table relation cache general and does not depend on hive #16621
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,13 +21,13 @@ import javax.annotation.concurrent.GuardedBy | |
|
|
||
| import scala.collection.mutable | ||
|
|
||
| import com.google.common.cache.{Cache, CacheBuilder} | ||
| import org.apache.hadoop.conf.Configuration | ||
| import org.apache.hadoop.fs.Path | ||
|
|
||
| import org.apache.spark.internal.Logging | ||
| import org.apache.spark.sql.AnalysisException | ||
| import org.apache.spark.sql.catalyst.{CatalystConf, SimpleCatalystConf} | ||
| import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} | ||
| import org.apache.spark.sql.catalyst._ | ||
| import org.apache.spark.sql.catalyst.analysis._ | ||
| import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder | ||
| import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionInfo} | ||
|
|
@@ -117,6 +117,14 @@ class SessionCatalog( | |
| if (conf.caseSensitiveAnalysis) name else name.toLowerCase | ||
| } | ||
|
|
||
| /** | ||
| * A cache of qualified table name to table relation plan. | ||
| */ | ||
| val tableRelationCache: Cache[QualifiedTableName, LogicalPlan] = { | ||
| // TODO: create a config instead of hardcode 1000 here. | ||
| CacheBuilder.newBuilder().maximumSize(1000).build[QualifiedTableName, LogicalPlan]() | ||
| } | ||
|
|
||
| /** | ||
| * This method is used to make the given path qualified before we | ||
| * store this path in the underlying external catalog. So, when a path | ||
|
|
@@ -573,7 +581,7 @@ class SessionCatalog( | |
| val relationAlias = alias.getOrElse(table) | ||
| if (db == globalTempViewManager.database) { | ||
| globalTempViewManager.get(table).map { viewDef => | ||
| SubqueryAlias(relationAlias, viewDef, Some(name)) | ||
| SubqueryAlias(relationAlias, viewDef, None) | ||
| }.getOrElse(throw new NoSuchTableException(db, table)) | ||
| } else if (name.database.isDefined || !tempTables.contains(table)) { | ||
| val metadata = externalCatalog.getTable(db, table) | ||
|
|
@@ -586,12 +594,12 @@ class SessionCatalog( | |
| desc = metadata, | ||
| output = metadata.schema.toAttributes, | ||
| child = parser.parsePlan(viewText)) | ||
| SubqueryAlias(relationAlias, child, Option(name)) | ||
| SubqueryAlias(relationAlias, child, Some(name.copy(table = table, database = Some(db)))) | ||
| } else { | ||
| SubqueryAlias(relationAlias, SimpleCatalogRelation(metadata), None) | ||
| } | ||
| } else { | ||
| SubqueryAlias(relationAlias, tempTables(table), Option(name)) | ||
| SubqueryAlias(relationAlias, tempTables(table), None) | ||
|
||
| } | ||
| } | ||
| } | ||
|
|
@@ -651,14 +659,21 @@ class SessionCatalog( | |
| * Refresh the cache entry for a metastore table, if any. | ||
| */ | ||
| def refreshTable(name: TableIdentifier): Unit = synchronized { | ||
| val dbName = formatDatabaseName(name.database.getOrElse(currentDb)) | ||
| val tableName = formatTableName(name.table) | ||
|
|
||
| // Go through temporary tables and invalidate them. | ||
| // If the database is defined, this is definitely not a temp table. | ||
| // If the database is defined, this may be a global temporary view. | ||
| // If the database is not defined, there is a good chance this is a temp table. | ||
| if (name.database.isEmpty) { | ||
| tempTables.get(formatTableName(name.table)).foreach(_.refresh()) | ||
| } else if (formatDatabaseName(name.database.get) == globalTempViewManager.database) { | ||
| globalTempViewManager.get(formatTableName(name.table)).foreach(_.refresh()) | ||
| tempTables.get(tableName).foreach(_.refresh()) | ||
| } else if (dbName == globalTempViewManager.database) { | ||
| globalTempViewManager.get(tableName).foreach(_.refresh()) | ||
| } | ||
|
|
||
| // Also invalidate the table relation cache. | ||
|
||
| val qualifiedTableName = QualifiedTableName(dbName, tableName) | ||
| tableRelationCache.invalidate(qualifiedTableName) | ||
| } | ||
|
|
||
| /** | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -60,9 +60,11 @@ case class TableIdentifier(table: String, database: Option[String]) | |
| override val identifier: String = table | ||
|
|
||
| def this(table: String) = this(table, None) | ||
|
|
||
| } | ||
|
|
||
| /** A fully qualified identifier for a table (i.e., database.tableName) */ | ||
| case class QualifiedTableName(database: String, name: String) | ||
|
||
|
|
||
| object TableIdentifier { | ||
| def apply(tableName: String): TableIdentifier = new TableIdentifier(tableName) | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -386,7 +386,9 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { | |
| case relation: CatalogRelation if DDLUtils.isHiveTable(relation.catalogTable) => | ||
| relation.catalogTable.identifier | ||
| } | ||
| EliminateSubqueryAliases(catalog.lookupRelation(tableIdentWithDB)) match { | ||
|
|
||
| val tableRelation = df.sparkSession.table(tableIdentWithDB).queryExecution.analyzed | ||
|
||
| EliminateSubqueryAliases(tableRelation) match { | ||
| // check if the table is a data source table (the relation is a BaseRelation). | ||
| case LogicalRelation(dest: BaseRelation, _, _) if srcRelations.contains(dest) => | ||
| throw new AnalysisException( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,18 +17,17 @@ | |
|
|
||
| package org.apache.spark.sql.execution.datasources | ||
|
|
||
| import scala.collection.mutable.ArrayBuffer | ||
| import java.util.concurrent.Callable | ||
|
|
||
| import org.apache.hadoop.fs.Path | ||
| import scala.collection.mutable.ArrayBuffer | ||
|
|
||
| import org.apache.spark.internal.Logging | ||
| import org.apache.spark.rdd.RDD | ||
| import org.apache.spark.sql._ | ||
| import org.apache.spark.sql.catalyst.{CatalystConf, CatalystTypeConverters, InternalRow} | ||
| import org.apache.spark.sql.catalyst.{CatalystConf, CatalystTypeConverters, InternalRow, QualifiedTableName, TableIdentifier} | ||
| import org.apache.spark.sql.catalyst.CatalystTypeConverters.convertToScala | ||
| import org.apache.spark.sql.catalyst.analysis._ | ||
| import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTablePartition, SimpleCatalogRelation} | ||
| import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec | ||
| import org.apache.spark.sql.catalyst.catalog.{CatalogTable, SimpleCatalogRelation} | ||
| import org.apache.spark.sql.catalyst.expressions | ||
| import org.apache.spark.sql.catalyst.expressions._ | ||
| import org.apache.spark.sql.catalyst.planning.PhysicalOperation | ||
|
|
@@ -37,6 +36,7 @@ import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, UnknownPa | |
| import org.apache.spark.sql.catalyst.rules.Rule | ||
| import org.apache.spark.sql.execution.{RowDataSourceScanExec, SparkPlan} | ||
| import org.apache.spark.sql.execution.command._ | ||
| import org.apache.spark.sql.internal.StaticSQLConf | ||
| import org.apache.spark.sql.sources._ | ||
| import org.apache.spark.sql.types._ | ||
| import org.apache.spark.unsafe.types.UTF8String | ||
|
|
@@ -215,37 +215,43 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] { | |
|
|
||
|
|
||
| /** | ||
| * Replaces [[SimpleCatalogRelation]] with data source table if its table property contains data | ||
| * source information. | ||
| * Replaces [[SimpleCatalogRelation]] with data source table if its table provider is not hive. | ||
| */ | ||
| class FindDataSourceTable(sparkSession: SparkSession) extends Rule[LogicalPlan] { | ||
| private def readDataSourceTable( | ||
| sparkSession: SparkSession, | ||
| simpleCatalogRelation: SimpleCatalogRelation): LogicalPlan = { | ||
| val table = simpleCatalogRelation.catalogTable | ||
| val pathOption = table.storage.locationUri.map("path" -> _) | ||
| val dataSource = | ||
| DataSource( | ||
| sparkSession, | ||
| userSpecifiedSchema = Some(table.schema), | ||
| partitionColumns = table.partitionColumnNames, | ||
| bucketSpec = table.bucketSpec, | ||
| className = table.provider.get, | ||
| options = table.storage.properties ++ pathOption) | ||
|
|
||
| LogicalRelation( | ||
| dataSource.resolveRelation(), | ||
| expectedOutputAttributes = Some(simpleCatalogRelation.output), | ||
| catalogTable = Some(table)) | ||
| private def readDataSourceTable(table: CatalogTable): LogicalPlan = { | ||
| val qualifiedTableName = QualifiedTableName(table.database, table.identifier.table) | ||
| val cache = sparkSession.sessionState.catalog.tableRelationCache | ||
| val withHiveSupport = | ||
| sparkSession.sparkContext.conf.get(StaticSQLConf.CATALOG_IMPLEMENTATION) == "hive" | ||
|
|
||
| cache.get(qualifiedTableName, new Callable[LogicalPlan]() { | ||
| override def call(): LogicalPlan = { | ||
| val pathOption = table.storage.locationUri.map("path" -> _) | ||
| val dataSource = | ||
| DataSource( | ||
| sparkSession, | ||
| // In older version(prior to 2.1) of Spark, the table schema can be empty and should be | ||
| // inferred at runtime. We should still support it. | ||
| userSpecifiedSchema = if (table.schema.isEmpty) None else Some(table.schema), | ||
| partitionColumns = table.partitionColumnNames, | ||
| bucketSpec = table.bucketSpec, | ||
| className = table.provider.get, | ||
| options = table.storage.properties ++ pathOption, | ||
| // TODO: improve `InMemoryCatalog` and remove this limitation. | ||
| catalogTable = if (withHiveSupport) Some(table) else None) | ||
|
|
||
| LogicalRelation(dataSource.resolveRelation(), catalogTable = Some(table)) | ||
|
||
| } | ||
| }) | ||
| } | ||
|
|
||
| override def apply(plan: LogicalPlan): LogicalPlan = plan transform { | ||
| case i @ InsertIntoTable(s: SimpleCatalogRelation, _, _, _, _) | ||
| if DDLUtils.isDatasourceTable(s.metadata) => | ||
| i.copy(table = readDataSourceTable(sparkSession, s)) | ||
| i.copy(table = readDataSourceTable(s.metadata)) | ||
|
|
||
| case s: SimpleCatalogRelation if DDLUtils.isDatasourceTable(s.metadata) => | ||
| readDataSourceTable(sparkSession, s) | ||
| readDataSourceTable(s.metadata) | ||
| } | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1626,17 +1626,6 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { | |
| assert(d.size == d.distinct.size) | ||
| } | ||
|
|
||
| test("SPARK-17625: data source table in InMemoryCatalog should guarantee output consistency") { | ||
| val tableName = "tbl" | ||
| withTable(tableName) { | ||
| spark.range(10).select('id as 'i, 'id as 'j).write.saveAsTable(tableName) | ||
| val relation = spark.sessionState.catalog.lookupRelation(TableIdentifier(tableName)) | ||
| val expr = relation.resolve("i") | ||
| val qe = spark.sessionState.executePlan(Project(Seq(expr), relation)) | ||
| qe.assertAnalyzed() | ||
| } | ||
| } | ||
|
|
||
| private def verifyNullabilityInFilterExec( | ||
|
||
| df: DataFrame, | ||
| expr: String, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1790,7 +1790,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { | |
| } | ||
|
|
||
| test("SET LOCATION for managed table") { | ||
| withTable("src") { | ||
| withTable("tbl") { | ||
| withTempDir { dir => | ||
| sql("CREATE TABLE tbl(i INT) USING parquet") | ||
| sql("INSERT INTO tbl SELECT 1") | ||
|
|
@@ -1799,6 +1799,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { | |
| .getTableMetadata(TableIdentifier("tbl")).storage.locationUri.get | ||
|
|
||
| sql(s"ALTER TABLE tbl SET LOCATION '${dir.getCanonicalPath}'") | ||
| spark.catalog.refreshTable("tbl") | ||
|
||
| // SET LOCATION won't move data from previous table path to new table path. | ||
| assert(spark.table("tbl").count() == 0) | ||
| // the previous table path should be still there. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi, @cloud-fan .
Why not make this a config in this PR? It seems to be easy.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yea it's easy, but I wanna minimize the code changes so it's easier to review.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep. Sure~