-
Notifications
You must be signed in to change notification settings - Fork 29.3k
[SPARK-28565][SQL] DataFrameWriter saveAsTable support for V2 catalogs #25330
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
42e231f
cd8d4a6
4797234
f9c2f95
6f3c106
dbb7e1f
953da51
629f3d2
554507a
b584359
cc4d349
57f469d
d0ab258
a67df29
50f1eef
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,12 +22,13 @@ import java.util.{Locale, Properties, UUID} | |
| import scala.collection.JavaConverters._ | ||
|
|
||
| import org.apache.spark.annotation.Stable | ||
| import org.apache.spark.sql.catalog.v2.{CatalogPlugin, Identifier} | ||
| import org.apache.spark.sql.catalog.v2.{CatalogPlugin, Identifier, TableCatalog} | ||
| import org.apache.spark.sql.catalog.v2.expressions._ | ||
| import org.apache.spark.sql.catalyst.TableIdentifier | ||
| import org.apache.spark.sql.catalyst.analysis.{EliminateSubqueryAliases, UnresolvedRelation} | ||
| import org.apache.spark.sql.catalyst.analysis.{EliminateSubqueryAliases, NoSuchTableException, UnresolvedRelation} | ||
| import org.apache.spark.sql.catalyst.catalog._ | ||
| import org.apache.spark.sql.catalyst.expressions.Literal | ||
| import org.apache.spark.sql.catalyst.plans.logical.{AppendData, InsertIntoTable, LogicalPlan, OverwriteByExpression, OverwritePartitionsDynamic} | ||
| import org.apache.spark.sql.catalyst.plans.logical._ | ||
| import org.apache.spark.sql.execution.SQLExecution | ||
| import org.apache.spark.sql.execution.command.DDLUtils | ||
| import org.apache.spark.sql.execution.datasources.{CreateTable, DataSource, DataSourceUtils, LogicalRelation} | ||
|
|
@@ -37,7 +38,7 @@ import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister} | |
| import org.apache.spark.sql.sources.BaseRelation | ||
| import org.apache.spark.sql.sources.v2._ | ||
| import org.apache.spark.sql.sources.v2.TableCapability._ | ||
| import org.apache.spark.sql.types.StructType | ||
| import org.apache.spark.sql.types.{IntegerType, StructType} | ||
| import org.apache.spark.sql.util.CaseInsensitiveStringMap | ||
|
|
||
| /** | ||
|
|
@@ -485,7 +486,80 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { | |
| * @since 1.4.0 | ||
| */ | ||
| def saveAsTable(tableName: String): Unit = { | ||
| saveAsTable(df.sparkSession.sessionState.sqlParser.parseTableIdentifier(tableName)) | ||
| import df.sparkSession.sessionState.analyzer.{AsTableIdentifier, CatalogObjectIdentifier} | ||
| import org.apache.spark.sql.catalog.v2.CatalogV2Implicits._ | ||
|
|
||
| val session = df.sparkSession | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: the local variable is used only once.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll use it in the follow up :) |
||
| val useV1Sources = | ||
| session.sessionState.conf.useV1SourceWriterList.toLowerCase(Locale.ROOT).split(",") | ||
| val cls = DataSource.lookupDataSource(source, session.sessionState.conf) | ||
| val shouldUseV1Source = cls.newInstance() match { | ||
| case d: DataSourceRegister if useV1Sources.contains(d.shortName()) => true | ||
| case _ => useV1Sources.contains(cls.getCanonicalName.toLowerCase(Locale.ROOT)) | ||
| } | ||
|
|
||
| val canUseV2 = !shouldUseV1Source && classOf[TableProvider].isAssignableFrom(cls) | ||
| val sessionCatalogOpt = session.sessionState.analyzer.sessionCatalog | ||
|
|
||
| session.sessionState.sqlParser.parseMultipartIdentifier(tableName) match { | ||
| case CatalogObjectIdentifier(Some(catalog), ident) => | ||
| saveAsTable(catalog.asTableCatalog, ident, modeForDSV2) | ||
|
|
||
| case CatalogObjectIdentifier(None, ident) if canUseV2 && sessionCatalogOpt.isDefined => | ||
| // We pass in the modeForDSV1, as using the V2 session catalog should maintain compatibility | ||
| // for now. | ||
| saveAsTable(sessionCatalogOpt.get.asTableCatalog, ident, modeForDSV1) | ||
|
|
||
| case AsTableIdentifier(tableIdentifier) => | ||
|
brkyvz marked this conversation as resolved.
|
||
| saveAsTable(tableIdentifier) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What about cases where |
||
| } | ||
| } | ||
|
|
||
|
|
||
| private def saveAsTable(catalog: TableCatalog, ident: Identifier, mode: SaveMode): Unit = { | ||
| val partitioning = partitioningColumns.map { colNames => | ||
| colNames.map(name => IdentityTransform(FieldReference(name))) | ||
| }.getOrElse(Seq.empty[Transform]) | ||
| val bucketing = bucketColumnNames.map { cols => | ||
| Seq(BucketTransform(LiteralValue(numBuckets.get, IntegerType), cols.map(FieldReference(_)))) | ||
| }.getOrElse(Seq.empty[Transform]) | ||
| val partitionTransforms = partitioning ++ bucketing | ||
|
|
||
| val tableOpt = try Option(catalog.loadTable(ident)) catch { | ||
| case _: NoSuchTableException => None | ||
| } | ||
|
|
||
| val command = (mode, tableOpt) match { | ||
| case (SaveMode.Append, Some(table)) => | ||
| AppendData.byName(DataSourceV2Relation.create(table), df.logicalPlan) | ||
|
brkyvz marked this conversation as resolved.
|
||
|
|
||
| case (SaveMode.Overwrite, _) => | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that the behavior here is to truncate and write, not replace.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we may want to change the table properties though, such as partitioning and schema, wouldn't we?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We need to match the behavior of v1 file sources. Do we know what that behavior is?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As discussed in DSV2 sync, the old behavior was to drop the old table and create a new one, therefore the Replace behavior here works. |
||
| ReplaceTableAsSelect( | ||
| catalog, | ||
| ident, | ||
| partitionTransforms, | ||
| df.queryExecution.analyzed, | ||
| Map.empty, // properties can't be specified through this API | ||
| extraOptions.toMap, | ||
| orCreate = true) // Create the table if it doesn't exist | ||
|
|
||
| case (other, _) => | ||
| // We have a potential race condition here in AppendMode, if the table suddenly gets | ||
| // created between our existence check and physical execution, but this can't be helped | ||
| // in any case. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we treat |
||
| CreateTableAsSelect( | ||
| catalog, | ||
| ident, | ||
| partitionTransforms, | ||
| df.queryExecution.analyzed, | ||
| Map.empty, | ||
| extraOptions.toMap, | ||
| ignoreIfExists = other == SaveMode.Ignore) | ||
| } | ||
|
|
||
| runCommand(df.sparkSession, "saveAsTable") { | ||
| command | ||
| } | ||
| } | ||
|
|
||
| private def saveAsTable(tableIdent: TableIdentifier): Unit = { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,21 +17,41 @@ | |
|
|
||
| package org.apache.spark.sql.sources.v2 | ||
|
|
||
| import org.scalatest.BeforeAndAfter | ||
| import java.util | ||
| import java.util.concurrent.ConcurrentHashMap | ||
|
|
||
| import org.apache.spark.sql.QueryTest | ||
| import scala.collection.JavaConverters._ | ||
| import scala.collection.mutable | ||
|
|
||
| import org.scalatest.{BeforeAndAfter, PrivateMethodTester} | ||
|
|
||
| import org.apache.spark.sql.{AnalysisException, QueryTest, Row, SparkSession} | ||
| import org.apache.spark.sql.catalog.v2.{CatalogPlugin, Identifier} | ||
| import org.apache.spark.sql.catalog.v2.expressions.Transform | ||
| import org.apache.spark.sql.catalyst.TableIdentifier | ||
| import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException | ||
| import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, CatalogStorageFormat, CatalogTable, CatalogTableType} | ||
| import org.apache.spark.sql.execution.QueryExecution | ||
| import org.apache.spark.sql.execution.datasources.v2.V2SessionCatalog | ||
| import org.apache.spark.sql.execution.datasources.v2.parquet.{ParquetDataSourceV2, ParquetTable} | ||
| import org.apache.spark.sql.internal.SQLConf | ||
| import org.apache.spark.sql.internal.SQLConf.{PARTITION_OVERWRITE_MODE, PartitionOverwriteMode} | ||
| import org.apache.spark.sql.test.SharedSQLContext | ||
| import org.apache.spark.sql.types.StructType | ||
| import org.apache.spark.sql.util.{CaseInsensitiveStringMap, QueryExecutionListener} | ||
|
|
||
| class DataSourceV2DataFrameSuite extends QueryTest with SharedSQLContext with BeforeAndAfter { | ||
| import testImplicits._ | ||
|
|
||
| private val v2Format = classOf[InMemoryTableProvider].getName | ||
| private val dfData = Seq((1L, "a"), (2L, "b"), (3L, "c")) | ||
| private val catalogName = "testcat" | ||
|
|
||
| before { | ||
| spark.conf.set("spark.sql.catalog.testcat", classOf[TestInMemoryTableCatalog].getName) | ||
| spark.conf.set(s"spark.sql.catalog.$catalogName", classOf[TestInMemoryTableCatalog].getName) | ||
| spark.conf.set("spark.sql.catalog.testcat2", classOf[TestInMemoryTableCatalog].getName) | ||
|
|
||
| val df = spark.createDataFrame(Seq((1L, "a"), (2L, "b"), (3L, "c"))).toDF("id", "data") | ||
| df.createOrReplaceTempView("source") | ||
| spark.createDataFrame(dfData).toDF("id", "data").createOrReplaceTempView("source") | ||
| val df2 = spark.createDataFrame(Seq((4L, "d"), (5L, "e"), (6L, "f"))).toDF("id", "data") | ||
| df2.createOrReplaceTempView("source2") | ||
| } | ||
|
|
@@ -42,6 +62,19 @@ class DataSourceV2DataFrameSuite extends QueryTest with SharedSQLContext with Be | |
| spark.sql("DROP VIEW source2") | ||
| } | ||
|
|
||
| private def sessionCatalogTest(testName: String)(f: SparkSession => Unit): Unit = { | ||
| test("using session catalog: " + testName) { | ||
| val catalogConf = SQLConf.V2_SESSION_CATALOG | ||
| val newSession = spark.newSession() | ||
| newSession.createDataFrame(dfData).toDF("id", "data").createOrReplaceTempView("source") | ||
| newSession.sessionState.conf.setConf(catalogConf, classOf[TestV2SessionCatalog].getName) | ||
| try f(newSession) finally { | ||
| newSession.catalog("session").asInstanceOf[TestV2SessionCatalog].clearTables() | ||
| newSession.sql("DROP VIEW source") | ||
| } | ||
| } | ||
| } | ||
|
|
||
| test("insertInto: append") { | ||
| val t1 = "testcat.ns1.ns2.tbl" | ||
| withTable(t1) { | ||
|
|
@@ -104,4 +137,169 @@ class DataSourceV2DataFrameSuite extends QueryTest with SharedSQLContext with Be | |
| } | ||
| } | ||
| } | ||
|
|
||
| testQuietly("saveAsTable: with defined catalog and table doesn't exist") { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This and many other tests don't need |
||
| val t1 = "testcat.ns1.ns2.tbl" | ||
| withTable(t1) { | ||
| spark.table("source").write.saveAsTable(t1) | ||
| checkAnswer(spark.table(t1), spark.table("source")) | ||
| } | ||
| } | ||
|
|
||
| testQuietly("saveAsTable: with defined catalog and table exists") { | ||
| val t1 = "testcat.ns1.ns2.tbl" | ||
| withTable(t1) { | ||
| sql(s"CREATE TABLE $t1 (id bigint, data string) USING foo") | ||
| // Default saveMode is append, therefore this doesn't throw a table already exists eception | ||
| spark.table("source").write.saveAsTable(t1) | ||
| checkAnswer(spark.table(t1), spark.table("source")) | ||
| } | ||
| } | ||
|
|
||
| testQuietly("saveAsTable: with defined catalog + table overwrite and table doesn't exist") { | ||
| val t1 = "testcat.ns1.ns2.tbl" | ||
| withTable(t1) { | ||
| spark.table("source").write.mode("overwrite").saveAsTable(t1) | ||
| checkAnswer(spark.table(t1), spark.table("source")) | ||
| } | ||
| } | ||
|
|
||
| testQuietly("saveAsTable: with defined catalog + table overwrite and table exists") { | ||
| val t1 = "testcat.ns1.ns2.tbl" | ||
| withTable(t1) { | ||
| sql(s"CREATE TABLE $t1 USING foo AS SELECT 'c', 'd'") | ||
| spark.table("source").write.mode("overwrite").saveAsTable(t1) | ||
| checkAnswer(spark.table(t1), spark.table("source")) | ||
| } | ||
| } | ||
|
|
||
| testQuietly("saveAsTable: with defined catalog + ignore mode and table doesn't exist") { | ||
| val t1 = "testcat.ns1.ns2.tbl" | ||
| withTable(t1) { | ||
| spark.table("source").write.mode("ignore").saveAsTable(t1) | ||
| checkAnswer(spark.table(t1), spark.table("source")) | ||
| } | ||
| } | ||
|
|
||
| testQuietly("saveAsTable: with defined catalog + ignore mode and table exists") { | ||
| val t1 = "testcat.ns1.ns2.tbl" | ||
| withTable(t1) { | ||
| sql(s"CREATE TABLE $t1 USING foo AS SELECT 'c', 'd'") | ||
| spark.table("source").write.mode("ignore").saveAsTable(t1) | ||
| checkAnswer(spark.table(t1), Seq(Row("c", "d"))) | ||
| } | ||
| } | ||
|
|
||
| sessionCatalogTest("saveAsTable and v2 table - table doesn't exist") { session => | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Consider moving all session catalog tests to a new suite file? |
||
| val t1 = "tbl" | ||
| session.table("source").write.format(v2Format).saveAsTable(t1) | ||
| checkAnswer(session.table(t1), session.table("source")) | ||
| } | ||
|
|
||
| sessionCatalogTest("saveAsTable: v2 table - table exists") { session => | ||
| val t1 = "tbl" | ||
| session.sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format") | ||
| intercept[TableAlreadyExistsException] { | ||
| session.table("source").select("id", "data").write.format(v2Format).saveAsTable(t1) | ||
| } | ||
| session.table("source").write.format(v2Format).mode("append").saveAsTable(t1) | ||
| checkAnswer(session.table(t1), session.table("source")) | ||
| } | ||
|
|
||
| sessionCatalogTest("saveAsTable: v2 table - table overwrite and table doesn't exist") { session => | ||
| val t1 = "tbl" | ||
| session.table("source").write.format(v2Format).mode("overwrite").saveAsTable(t1) | ||
| checkAnswer(session.table(t1), session.table("source")) | ||
| } | ||
|
|
||
| sessionCatalogTest("saveAsTable: v2 table - table overwrite and table exists") { session => | ||
| val t1 = "tbl" | ||
| session.sql(s"CREATE TABLE $t1 USING $v2Format AS SELECT 'c', 'd'") | ||
| session.table("source").write.format(v2Format).mode("overwrite").saveAsTable(t1) | ||
| checkAnswer(session.table(t1), session.table("source")) | ||
| } | ||
|
|
||
| sessionCatalogTest("saveAsTable: v2 table - ignore mode and table doesn't exist") { session => | ||
| val t1 = "tbl" | ||
| session.table("source").write.format(v2Format).mode("ignore").saveAsTable(t1) | ||
| checkAnswer(session.table(t1), session.table("source")) | ||
| } | ||
|
|
||
| sessionCatalogTest("saveAsTable: v2 table - ignore mode and table exists") { session => | ||
| val t1 = "tbl" | ||
| session.sql(s"CREATE TABLE $t1 USING $v2Format AS SELECT 'c', 'd'") | ||
| session.table("source").write.format(v2Format).mode("ignore").saveAsTable(t1) | ||
| checkAnswer(session.table(t1), Seq(Row("c", "d"))) | ||
| } | ||
|
|
||
| sessionCatalogTest("saveAsTable: old table defined in a database colliding " + | ||
| "with a catalog name") { session => | ||
| // Make sure the database name conflicts with a catalog name | ||
| val dbPath = session.sessionState.catalog.getDefaultDBPath(catalogName) | ||
| session.sessionState.catalog.createDatabase( | ||
| CatalogDatabase(catalogName, "", dbPath, Map.empty), ignoreIfExists = false) | ||
| val t1 = "tbl" | ||
| withTable(t1) { | ||
| // Create the table in the built in catalog, in the given database | ||
| session.sessionState.catalog.createTable( | ||
| CatalogTable( | ||
| identifier = TableIdentifier(t1, Some(catalogName)), | ||
| tableType = CatalogTableType.MANAGED, | ||
| provider = Some(v2Format), | ||
| storage = CatalogStorageFormat(None, None, None, None, false, Map.empty), | ||
| schema = session.table("source").schema | ||
| ), | ||
| ignoreIfExists = false | ||
| ) | ||
| val tableName = s"$catalogName.$t1" | ||
| checkAnswer(session.table(tableName), Nil) | ||
| intercept[TableAlreadyExistsException] { | ||
| // Make sure default save mode is same as before | ||
| session.table("source").write.format(v2Format).saveAsTable(tableName) | ||
| } | ||
| session.table("source").write.format(v2Format).mode("append").saveAsTable(tableName) | ||
| checkAnswer(session.table(tableName), session.table("source")) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| class InMemoryTableProvider extends TableProvider { | ||
| override def getTable(options: CaseInsensitiveStringMap): Table = { | ||
| throw new UnsupportedOperationException("D'oh!") | ||
| } | ||
| } | ||
|
|
||
| /** A SessionCatalog that always loads an in memory Table, so we can test write code paths. */ | ||
| class TestV2SessionCatalog extends V2SessionCatalog { | ||
|
|
||
| protected val tables: util.Map[Identifier, InMemoryTable] = | ||
| new ConcurrentHashMap[Identifier, InMemoryTable]() | ||
|
|
||
| override def loadTable(ident: Identifier): Table = { | ||
| if (tables.containsKey(ident)) { | ||
| tables.get(ident) | ||
| } else { | ||
| // Table was created through the built-in catalog | ||
| val t = super.loadTable(ident) | ||
| val table = new InMemoryTable(t.name(), t.schema(), t.partitioning(), t.properties()) | ||
| tables.put(ident, table) | ||
| table | ||
| } | ||
| } | ||
|
|
||
| override def createTable( | ||
| ident: Identifier, | ||
| schema: StructType, | ||
| partitions: Array[Transform], | ||
| properties: util.Map[String, String]): Table = { | ||
| val t = new InMemoryTable(ident.name(), schema, partitions, properties) | ||
| tables.put(ident, t) | ||
| t | ||
| } | ||
|
|
||
| def clearTables(): Unit = { | ||
| assert(!tables.isEmpty, "Tables were empty, maybe didn't use the session catalog code path?") | ||
| tables.keySet().asScala.foreach(super.dropTable) | ||
| tables.clear() | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's the tricky thing. We need to see if we can load using the defined catalog first. If the catalog is defined, but it doesn't return a table, then we miss out on using the session catalog.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When a user tries to use table
cat1.ns1.ns2.tbl, wherecat1exists, but without tablens1.ns2.tblcat1.ns1.ns2.tblIMHO, Spark should throw no table found exception. This is consistent with the follow statement in #24768:
Maybe I missed some discussions about fallback?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I disagree.
Determining which catalog is responsible for an identifier is independent of the catalog results. If
maybeCatalogisNone, then the session catalog is responsible for the identifier. Whether to use v1 or v2 after that point depends on the provider.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@rdblue Imagine the following use case. Let's say as the data science team, I have a database on the Hive MetaStore called
prod. I have tables named like:prod.tbl1,prod.tbl2. Now, some other team (maybe data engineering team), creates a v2 catalog calledprodand adds it to my environment.Won't the
prodcatalog hijack all my requests, and start failing to resolve the tables I had declared before, which I was expecting the V2SessionCatalog to resolve? I, as an unaware user, suddenly I have all my jobs failing and chaos ensues.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes. A conflict could break queries in both directions when there is a conflict between namespace
prodand catalogprod:When we wrote the SPIP, the choice was to make catalog take precedence because:
In short, we expect fewer problems when catalogs take precedence.