[SPARK-40737][CONNECT] Add basic support for DataFrameWriter #38192
Changes from all commits
@@ -17,6 +17,8 @@
syntax = 'proto3';

import "spark/connect/expressions.proto";
import "spark/connect/relations.proto";
import "spark/connect/types.proto";

package spark.connect;
@@ -29,6 +31,7 @@ option java_package = "org.apache.spark.connect.proto";
message Command {
  oneof command_type {
    CreateScalarFunction create_function = 1;
    WriteOperation write_operation = 2;
  }
}
@@ -62,3 +65,39 @@ message CreateScalarFunction {
    FUNCTION_LANGUAGE_SCALA = 3;
  }
}

// As writes are not directly handled during analysis and planning, they are modeled as commands.
message WriteOperation {
  // The output of the `input` relation will be persisted according to the options.
  Relation input = 1;
  // Format value according to the Spark documentation. Examples are: text, parquet, delta.
  string source = 2;
  // The destination of the write operation must be either a path or a table.
|
Contributor: in DF API, people can do …
  oneof save_type {
    string path = 3;
    string table_name = 4;
  }
  SaveMode mode = 5;
|
Contributor: We added … Anyway, we need to support save mode in the proto definition to support the existing DF API. If we want to support …
  // List of columns to sort the output by.
  repeated string sort_column_names = 6;
|
Contributor: This should be part of the BucketBy
  // List of columns for partitioning.
  repeated string partitioning_columns = 7;
  // Optional bucketing specification. Bucketing must set the number of buckets and the columns
  // to bucket by.
  BucketBy bucket_by = 8;
  // Optional list of configuration options.
  map<string, string> options = 9;

  message BucketBy {
    repeated string bucket_column_names = 1;
    int32 num_buckets = 2;
  }
  enum SaveMode {
    SAVE_MODE_UNSPECIFIED = 0;
    SAVE_MODE_APPEND = 1;
    SAVE_MODE_OVERWRITE = 2;
    SAVE_MODE_ERROR_IF_EXISTS = 3;
    SAVE_MODE_IGNORE = 4;
  }
}
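For orientation, here is a rough sketch of how a typical DataFrameWriter call could be encoded with the message above, assuming the standard protobuf-java builders generated from these definitions. The client-side API itself is not part of this PR, and encodeWrite and the relation argument are placeholders for illustration only.

import org.apache.spark.connect.proto

// Hypothetical encoding of:
//   df.write.format("parquet").mode("append")
//     .partitionBy("date").option("compression", "snappy").save("/tmp/out")
def encodeWrite(relation: proto.Relation): proto.Command = {
  val write = proto.WriteOperation
    .newBuilder()
    .setInput(relation) // plan whose output is persisted
    .setSource("parquet") // format(...)
    .setMode(proto.WriteOperation.SaveMode.SAVE_MODE_APPEND) // mode("append")
    .addPartitioningColumns("date") // partitionBy("date")
    .putOptions("compression", "snappy") // option(...)
    .setPath("/tmp/out") // save(path); setTableName(...) would correspond to saveAsTable
    .build()
  proto.Command.newBuilder().setWriteOperation(write).build()
}

Regarding the reviewer comment on sort_column_names: in the existing DataFrameWriter API, sortBy is only valid in combination with bucketBy, which is why folding the sort columns into the BucketBy message is suggested.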
@@ -24,10 +24,16 @@ import com.google.common.collect.{Lists, Maps}
import org.apache.spark.annotation.{Since, Unstable}
import org.apache.spark.api.python.{PythonEvalType, SimplePythonFunction}
import org.apache.spark.connect.proto
import org.apache.spark.sql.SparkSession
import org.apache.spark.connect.proto.WriteOperation
import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.connect.planner.{DataTypeProtoConverter, SparkConnectPlanner}
import org.apache.spark.sql.execution.python.UserDefinedPythonFunction
import org.apache.spark.sql.types.StringType

final case class InvalidCommandInput(
    private val message: String = "",
    private val cause: Throwable = null)
  extends Exception(message, cause)

@Unstable
@Since("3.4.0")
|
|
@@ -40,6 +46,8 @@ class SparkConnectCommandPlanner(session: SparkSession, command: proto.Command)
    command.getCommandTypeCase match {
      case proto.Command.CommandTypeCase.CREATE_FUNCTION =>
        handleCreateScalarFunction(command.getCreateFunction)
      case proto.Command.CommandTypeCase.WRITE_OPERATION =>
        handleWriteOperation(command.getWriteOperation)
      case _ => throw new UnsupportedOperationException(s"$command not supported.")
    }
  }
|
|
@@ -74,4 +82,64 @@ class SparkConnectCommandPlanner(session: SparkSession, command: proto.Command)
    session.udf.registerPython(cf.getPartsList.asScala.head, udf)
  }

  /**
   * Transforms the write operation and executes it.
   *
   * The write operation contains a reference to the input plan, which is first transformed into
   * the corresponding logical plan. Afterwards, a DataFrameWriter is created and the parameters
   * of the WriteOperation are translated into the corresponding method calls.
   *
   * @param writeOperation the write operation to execute
   */
  def handleWriteOperation(writeOperation: WriteOperation): Unit = {
    // Transform the input plan into the logical plan.
    val planner = new SparkConnectPlanner(writeOperation.getInput, session)
    val plan = planner.transform()
    // And create a Dataset from the plan.
    val dataset = Dataset.ofRows(session, logicalPlan = plan)

    val w = dataset.write
    if (writeOperation.getMode != proto.WriteOperation.SaveMode.SAVE_MODE_UNSPECIFIED) {
      w.mode(DataTypeProtoConverter.toSaveMode(writeOperation.getMode))
    }

    if (writeOperation.getOptionsCount > 0) {
      writeOperation.getOptionsMap.asScala.foreach { case (key, value) => w.option(key, value) }
    }

    if (writeOperation.getSortColumnNamesCount > 0) {
      val names = writeOperation.getSortColumnNamesList.asScala
      w.sortBy(names.head, names.tail.toSeq: _*)
    }

    if (writeOperation.hasBucketBy) {
      val op = writeOperation.getBucketBy
      val cols = op.getBucketColumnNamesList.asScala
      if (op.getNumBuckets <= 0) {
        throw InvalidCommandInput(
          s"BucketBy must specify a bucket count > 0, received ${op.getNumBuckets} instead.")
      }
      w.bucketBy(op.getNumBuckets, cols.head, cols.tail.toSeq: _*)
    }

    if (writeOperation.getPartitioningColumnsCount > 0) {
      val names = writeOperation.getPartitioningColumnsList.asScala
      w.partitionBy(names.toSeq: _*)
    }

    if (writeOperation.getSource != null) {
      w.format(writeOperation.getSource)
    }

    writeOperation.getSaveTypeCase match {
      case proto.WriteOperation.SaveTypeCase.PATH => w.save(writeOperation.getPath)
      case proto.WriteOperation.SaveTypeCase.TABLE_NAME =>
        w.saveAsTable(writeOperation.getTableName)
      case _ =>
        throw new UnsupportedOperationException(
          "WriteOperation:SaveTypeCase not supported "
            + s"${writeOperation.getSaveTypeCase.getNumber}")
    }
  }

}
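The handler above delegates the mode conversion to DataTypeProtoConverter.toSaveMode, whose body is not part of this diff. Given the SaveMode enum defined in the proto and Spark's org.apache.spark.sql.SaveMode, the conversion presumably amounts to something like the following sketch; only the call site appears in the PR, so the body here is an assumption.

import org.apache.spark.connect.proto
import org.apache.spark.sql.SaveMode

// Sketch of the proto-to-Spark SaveMode mapping used by w.mode(...) above.
def toSaveMode(mode: proto.WriteOperation.SaveMode): SaveMode = mode match {
  case proto.WriteOperation.SaveMode.SAVE_MODE_APPEND => SaveMode.Append
  case proto.WriteOperation.SaveMode.SAVE_MODE_OVERWRITE => SaveMode.Overwrite
  case proto.WriteOperation.SaveMode.SAVE_MODE_ERROR_IF_EXISTS => SaveMode.ErrorIfExists
  case proto.WriteOperation.SaveMode.SAVE_MODE_IGNORE => SaveMode.Ignore
  case other =>
    throw new IllegalArgumentException(s"Cannot convert unsupported save mode: $other")
}

SAVE_MODE_UNSPECIFIED never reaches this conversion, because handleWriteOperation only calls w.mode(...) when the mode differs from the unspecified default.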