[SPARK-26356][SQL] remove SaveMode from data source v2 #24233
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -30,11 +30,10 @@ import org.apache.spark.sql.catalyst.plans.logical.{AppendData, InsertIntoTable, | |
| import org.apache.spark.sql.execution.SQLExecution | ||
| import org.apache.spark.sql.execution.command.DDLUtils | ||
| import org.apache.spark.sql.execution.datasources.{CreateTable, DataSource, LogicalRelation} | ||
| import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, DataSourceV2Utils, FileDataSourceV2, WriteToDataSourceV2} | ||
| import org.apache.spark.sql.execution.datasources.v2._ | ||
| import org.apache.spark.sql.sources.BaseRelation | ||
| import org.apache.spark.sql.sources.v2._ | ||
| import org.apache.spark.sql.sources.v2.TableCapability._ | ||
| import org.apache.spark.sql.sources.v2.writer.SupportsSaveMode | ||
| import org.apache.spark.sql.types.StructType | ||
| import org.apache.spark.sql.util.CaseInsensitiveStringMap | ||
|
|
||
|
|
@@ -267,6 +266,23 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { | |
|
|
||
| import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Implicits._ | ||
| provider.getTable(dsOptions) match { | ||
| // TODO: for backward compatibility reasons, the builtin file source needs to support all | ||
|
||
| // the save modes, which violates the semantic of `TableProvider`. Here we special-case | ||
|
||
| // file source and pass the save mode to file source directly. This hack can be removed | ||
| // after we figure out a general interface for path-based data sources. | ||
| case table: FileTable => | ||
|
Contributor
Why is this hack necessary? Why not put off v2 support for path-based tables?
Contributor
Author
Because we want to migrate the file source to v2, to validate the API. This hack is just to work around the issue that we do not have a proper entry API for path-based data sources, which I believe we will have later on. I think this is not a bad idea: we can unblock the file source migration, and we can keep DS v2 clean.
Contributor
File sources do not use v2 by default, so this is not a necessary change for this commit. I think it should be in a separate PR. We can discuss whether it is a good idea to add this hack in the next DSv2 sync. Please remove it from this PR.
Contributor
I'm adding this to my agenda for the sync-up.
Contributor
I believe that we agreed in the sync-up to remove this hack. Is there a reason why it is still included?
Contributor
Author
I explained in the comment: #24233 (comment). Removing the hack breaks several tests and I'd like to do it in another PR. Since
||
| val write = table.newWriteBuilder(dsOptions).asInstanceOf[FileWriteBuilder] | ||
| .mode(mode) | ||
| .withQueryId(UUID.randomUUID().toString) | ||
| .withInputDataSchema(df.logicalPlan.schema) | ||
| .buildForBatch() | ||
| // The returned `Write` can be null, which indicates that we can skip writing. | ||
| if (write != null) { | ||
| runCommand(df.sparkSession, "save") { | ||
| WriteToDataSourceV2(write, df.logicalPlan) | ||
| } | ||
| } | ||
|
|
||
| case table: SupportsWrite if table.supports(BATCH_WRITE) => | ||
| lazy val relation = DataSourceV2Relation.create(table, dsOptions) | ||
| mode match { | ||
|
|
@@ -282,24 +298,9 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { | |
| } | ||
|
|
||
| case _ => | ||
| table.newWriteBuilder(dsOptions) match { | ||
| case writeBuilder: SupportsSaveMode => | ||
| val write = writeBuilder.mode(mode) | ||
| .withQueryId(UUID.randomUUID().toString) | ||
| .withInputDataSchema(df.logicalPlan.schema) | ||
| .buildForBatch() | ||
| // It can only return null with `SupportsSaveMode`. We can clean it up after | ||
| // removing `SupportsSaveMode`. | ||
| if (write != null) { | ||
| runCommand(df.sparkSession, "save") { | ||
| WriteToDataSourceV2(write, df.logicalPlan) | ||
| } | ||
| } | ||
|
|
||
| case _ => | ||
| throw new AnalysisException( | ||
| s"data source ${table.name} does not support SaveMode $mode") | ||
| } | ||
| throw new AnalysisException(s"TableProvider implementation $source cannot be " + | ||
| "written with ErrorIfExists or Ignore modes, please use Append or Overwrite " + | ||
| "modes instead.") | ||
| } | ||
|
|
||
| // Streaming also uses the data source V2 API. So it may be that the data source implements | ||
|
|
||
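To make the user-facing effect of the DataFrameWriter changes above concrete, here is a minimal sketch (not from this PR) of how save modes behave for a non-file DSv2 source after this change. The format name `com.example.MyV2Source` and the output path are placeholder assumptions standing in for any `TableProvider` that supports `BATCH_WRITE` but is not a file source.

```scala
import org.apache.spark.sql.SparkSession

object SaveModeBehaviorSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("sketch").getOrCreate()
    val df = spark.range(10).toDF("id")
    val source = "com.example.MyV2Source" // placeholder v2 TableProvider

    // Append and Overwrite no longer pass a SaveMode to the source; they are
    // planned as the v2 AppendData / OverwriteByExpression operations.
    df.write.format(source).option("path", "/tmp/v2-sketch").mode("append").save()
    df.write.format(source).option("path", "/tmp/v2-sketch").mode("overwrite").save()

    // ErrorIfExists and Ignore have no v2 translation any more; this write fails
    // with an AnalysisException: "TableProvider implementation ... cannot be
    // written with ErrorIfExists or Ignore modes, please use Append or Overwrite
    // modes instead."
    df.write.format(source).option("path", "/tmp/v2-sketch").mode("ignore").save()

    spark.stop()
  }
}
```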
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,7 +21,6 @@ import java.util | |
|
|
||
| import scala.collection.JavaConverters._ | ||
|
|
||
| import org.apache.spark.sql.SaveMode | ||
| import org.apache.spark.sql.catalyst.InternalRow | ||
| import org.apache.spark.sql.sources.DataSourceRegister | ||
| import org.apache.spark.sql.sources.v2._ | ||
|
|
@@ -46,9 +45,7 @@ private[noop] object NoopTable extends Table with SupportsWrite with SupportsStr | |
| override def capabilities(): util.Set[TableCapability] = Set(TableCapability.BATCH_WRITE).asJava | ||
| } | ||
|
|
||
| private[noop] object NoopWriteBuilder extends WriteBuilder | ||
| with SupportsSaveMode with SupportsTruncate { | ||
| override def mode(mode: SaveMode): WriteBuilder = this | ||
| private[noop] object NoopWriteBuilder extends WriteBuilder with SupportsTruncate { | ||
|
Member
Since now DSv2 (except file sources) can only write to an existing table, the write path of NoopDataSource will still fail here (analyzer rule
||
| override def truncate(): WriteBuilder = this | ||
| override def buildForBatch(): BatchWrite = NoopBatchWrite | ||
| override def buildForStreaming(): StreamingWrite = NoopStreamingWrite | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,14 +25,13 @@ import org.apache.spark.{SparkEnv, SparkException, TaskContext} | |
| import org.apache.spark.executor.CommitDeniedException | ||
| import org.apache.spark.internal.Logging | ||
| import org.apache.spark.rdd.RDD | ||
| import org.apache.spark.sql.SaveMode | ||
| import org.apache.spark.sql.catalyst.InternalRow | ||
| import org.apache.spark.sql.catalyst.expressions.Attribute | ||
| import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan | ||
| import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} | ||
| import org.apache.spark.sql.sources.{AlwaysTrue, Filter} | ||
| import org.apache.spark.sql.sources.v2.SupportsWrite | ||
| import org.apache.spark.sql.sources.v2.writer.{BatchWrite, DataWriterFactory, SupportsDynamicOverwrite, SupportsOverwrite, SupportsSaveMode, SupportsTruncate, WriteBuilder, WriterCommitMessage} | ||
| import org.apache.spark.sql.sources.v2.writer.{BatchWrite, DataWriterFactory, SupportsDynamicOverwrite, SupportsOverwrite, SupportsTruncate, WriteBuilder, WriterCommitMessage} | ||
|
Member
Nit: import org.apache.spark.sql.sources.v2.writer._
Contributor
I don't agree with this. Wildcard imports make it difficult to cherry-pick commits and increase conflicts. They also make it difficult to see where symbols are coming from, and they pollute the namespace with everything in a package instead of just the required names. For example, I recently hit problems adding a
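A tiny, purely illustrative sketch of the clash explicit imports avoid (the package and type names below are made up, not from this PR): when two packages both define a `Table`, wildcard-importing both makes an unqualified reference ambiguous, while a selective or renaming import keeps the origin visible.

```scala
object catalogs { trait Table }
object sources  { trait Table }

object ImportAmbiguitySketch {
  import catalogs._
  import sources._
  // With both wildcards in scope, the next line would not compile:
  //   type T = Table   // error: reference to Table is ambiguous
  // A selective, renaming import keeps the origin explicit and avoids the clash:
  import catalogs.{Table => CatalogTable}
  type T = CatalogTable
}
```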
||
| import org.apache.spark.sql.util.CaseInsensitiveStringMap | ||
| import org.apache.spark.util.{LongAccumulator, Utils} | ||
|
|
||
|
|
@@ -58,13 +57,7 @@ case class AppendDataExec( | |
| query: SparkPlan) extends V2TableWriteExec with BatchWriteHelper { | ||
|
|
||
| override protected def doExecute(): RDD[InternalRow] = { | ||
| val batchWrite = newWriteBuilder() match { | ||
| case builder: SupportsSaveMode => | ||
| builder.mode(SaveMode.Append).buildForBatch() | ||
|
|
||
| case builder => | ||
| builder.buildForBatch() | ||
| } | ||
| val batchWrite = newWriteBuilder().buildForBatch() | ||
| doWrite(batchWrite) | ||
| } | ||
| } | ||
|
|
@@ -94,9 +87,6 @@ case class OverwriteByExpressionExec( | |
| case builder: SupportsTruncate if isTruncate(deleteWhere) => | ||
| builder.truncate().buildForBatch() | ||
|
|
||
| case builder: SupportsSaveMode if isTruncate(deleteWhere) => | ||
| builder.mode(SaveMode.Overwrite).buildForBatch() | ||
|
|
||
| case builder: SupportsOverwrite => | ||
| builder.overwrite(deleteWhere).buildForBatch() | ||
|
|
||
|
|
@@ -127,9 +117,6 @@ case class OverwritePartitionsDynamicExec( | |
| case builder: SupportsDynamicOverwrite => | ||
| builder.overwriteDynamicPartitions().buildForBatch() | ||
|
|
||
| case builder: SupportsSaveMode => | ||
| builder.mode(SaveMode.Overwrite).buildForBatch() | ||
|
|
||
| case _ => | ||
| throw new SparkException(s"Table does not support dynamic partition overwrite: $table") | ||
| } | ||
|
|
@@ -292,8 +279,8 @@ object DataWritingSparkTask extends Logging { | |
| } | ||
|
|
||
| private[v2] case class DataWritingSparkTaskResult( | ||
| numRows: Long, | ||
| writerCommitMessage: WriterCommitMessage) | ||
| numRows: Long, | ||
| writerCommitMessage: WriterCommitMessage) | ||
|
|
||
| /** | ||
| * Sink progress information collected after commit. | ||
|
|
||
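For writer implementors, the practical consequence of dropping `SupportsSaveMode` is visible in the matches above: truncate, filter-based overwrite, and dynamic partition overwrite must now be opted into through the remaining mix-ins, or the plan fails. Below is a hedged sketch of what such a `WriteBuilder` might look like; the class name and the trivial method bodies are illustrative assumptions, not code from this PR.

```scala
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.sources.v2.writer.{
  BatchWrite, SupportsDynamicOverwrite, SupportsOverwrite, SupportsTruncate, WriteBuilder}

// Illustrative only: a builder that declares its overwrite capabilities
// explicitly instead of accepting a SaveMode.
class ExampleWriteBuilder extends WriteBuilder
    with SupportsTruncate with SupportsOverwrite with SupportsDynamicOverwrite {

  // Matched by OverwriteByExpressionExec when the delete filter is AlwaysTrue.
  override def truncate(): WriteBuilder = this

  // Matched by OverwriteByExpressionExec with the concrete delete filters.
  override def overwrite(filters: Array[Filter]): WriteBuilder = this

  // Matched by OverwritePartitionsDynamicExec.
  override def overwriteDynamicPartitions(): WriteBuilder = this

  // Build the actual BatchWrite; left unimplemented in this sketch.
  override def buildForBatch(): BatchWrite = ???
}
```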
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,7 +26,7 @@ import scala.collection.JavaConverters._ | |
| import test.org.apache.spark.sql.sources.v2._ | ||
|
|
||
| import org.apache.spark.SparkException | ||
| import org.apache.spark.sql.{DataFrame, QueryTest, Row} | ||
| import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row} | ||
| import org.apache.spark.sql.catalyst.InternalRow | ||
| import org.apache.spark.sql.execution.datasources.v2.{BatchScanExec, DataSourceV2Relation} | ||
| import org.apache.spark.sql.execution.exchange.{Exchange, ShuffleExchangeExec} | ||
|
|
@@ -218,36 +218,30 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext { | |
| val path = file.getCanonicalPath | ||
| assert(spark.read.format(cls.getName).option("path", path).load().collect().isEmpty) | ||
|
|
||
| spark.range(10).select('id as 'i, -'id as 'j).write.format(cls.getName) | ||
| .option("path", path).save() | ||
| checkAnswer( | ||
| spark.read.format(cls.getName).option("path", path).load(), | ||
| spark.range(10).select('id, -'id)) | ||
|
|
||
| // test with different save modes | ||
| spark.range(10).select('id as 'i, -'id as 'j).write.format(cls.getName) | ||
| .option("path", path).mode("append").save() | ||
|
Contributor
I don't think this change is needed because the default mode for v2 is append.
Contributor
Author
Before my PR, we wrote the files twice: once without a save mode, once with append mode. Now I've switched the order, to make sure that the second write doesn't specify a save mode and to prove that the default mode is append.
||
| checkAnswer( | ||
| spark.read.format(cls.getName).option("path", path).load(), | ||
| spark.range(10).union(spark.range(10)).select('id, -'id)) | ||
| spark.range(10).select('id, -'id)) | ||
|
|
||
| spark.range(5).select('id as 'i, -'id as 'j).write.format(cls.getName) | ||
| .option("path", path).mode("overwrite").save() | ||
| checkAnswer( | ||
| spark.read.format(cls.getName).option("path", path).load(), | ||
| spark.range(5).select('id, -'id)) | ||
|
|
||
| spark.range(5).select('id as 'i, -'id as 'j).write.format(cls.getName) | ||
| .option("path", path).mode("ignore").save() | ||
| checkAnswer( | ||
| spark.read.format(cls.getName).option("path", path).load(), | ||
| spark.range(5).select('id, -'id)) | ||
| val e = intercept[AnalysisException] { | ||
| spark.range(5).select('id as 'i, -'id as 'j).write.format(cls.getName) | ||
| .option("path", path).mode("ignore").save() | ||
| } | ||
| assert(e.message.contains("please use Append or Overwrite modes instead")) | ||
|
|
||
| val e = intercept[Exception] { | ||
| val e2 = intercept[AnalysisException] { | ||
| spark.range(5).select('id as 'i, -'id as 'j).write.format(cls.getName) | ||
| .option("path", path).mode("error").save() | ||
| } | ||
| assert(e.getMessage.contains("data already exists")) | ||
| assert(e2.getMessage.contains("please use Append or Overwrite modes instead")) | ||
|
|
||
| // test transaction | ||
| val failingUdf = org.apache.spark.sql.functions.udf { | ||
|
|
@@ -262,10 +256,10 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext { | |
| } | ||
| // this input data will fail to read middle way. | ||
| val input = spark.range(10).select(failingUdf('id).as('i)).select('i, -'i as 'j) | ||
| val e2 = intercept[SparkException] { | ||
| val e3 = intercept[SparkException] { | ||
| input.write.format(cls.getName).option("path", path).mode("overwrite").save() | ||
| } | ||
| assert(e2.getMessage.contains("Writing job aborted")) | ||
| assert(e3.getMessage.contains("Writing job aborted")) | ||
| // make sure we don't have partial data. | ||
| assert(spark.read.format(cls.getName).option("path", path).load().collect().isEmpty) | ||
| } | ||
|
|
@@ -280,7 +274,7 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext { | |
|
|
||
| val numPartition = 6 | ||
| spark.range(0, 10, 1, numPartition).select('id as 'i, -'id as 'j).write.format(cls.getName) | ||
| .option("path", path).save() | ||
| .option("path", path).mode("append").save() | ||
| checkAnswer( | ||
| spark.read.format(cls.getName).option("path", path).load(), | ||
| spark.range(10).select('id, -'id)) | ||
|
|
@@ -367,31 +361,13 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext { | |
| val format = classOf[SimpleWritableDataSource].getName | ||
|
|
||
| val df = Seq((1L, 2L)).toDF("i", "j") | ||
| df.write.format(format).option("path", optionPath).save() | ||
| df.write.format(format).option("path", optionPath).mode("append").save() | ||
| assert(!new File(sessionPath).exists) | ||
| checkAnswer(spark.read.format(format).option("path", optionPath).load(), df) | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| test("SPARK-25700: do not read schema when writing in other modes except append and overwrite") { | ||
|
||
| withTempPath { file => | ||
| val cls = classOf[SimpleWriteOnlyDataSource] | ||
| val path = file.getCanonicalPath | ||
| val df = spark.range(5).select('id as 'i, -'id as 'j) | ||
| // non-append mode should not throw exception, as they don't access schema. | ||
| df.write.format(cls.getName).option("path", path).mode("error").save() | ||
| df.write.format(cls.getName).option("path", path).mode("ignore").save() | ||
| // append and overwrite modes will access the schema and should throw exception. | ||
| intercept[SchemaReadAttemptException] { | ||
| df.write.format(cls.getName).option("path", path).mode("append").save() | ||
| } | ||
| intercept[SchemaReadAttemptException] { | ||
| df.write.format(cls.getName).option("path", path).mode("overwrite").save() | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
||
|
|
||
The phrase "deal with" is not very specific. I think it would be better to say what these tables can be used for: data operations like read, append, delete, and overwrite. Not operations that require metadata changes.