[SPARK-46043][SQL] Support create table using DSv2 sources #43949
error-classes.json:

@@ -149,6 +149,19 @@
     ],
     "sqlState" : "42846"
   },
+  "CANNOT_CREATE_DATA_SOURCE_V2_TABLE" : {
+    "message" : [
+      "Failed to create data source V2 table:"
+    ],
+    "subClass" : {
+      "EXTERNAL_METADATA_UNSUPPORTED" : {
+        "message" : [
+          "provider '<provider>' does not support external metadata but a schema is provided. Please remove the schema when creating the table."
+        ]
+      }
+    },
+    "sqlState" : "42KDE"
+  },
   "CANNOT_DECODE_URL" : {
     "message" : [
       "The provided URL cannot be decoded: <url>. Please ensure that the URL is properly formatted and try again."
New error class documentation page (Markdown):

@@ -0,0 +1,32 @@
+---
+layout: global
+title: CANNOT_CREATE_DATA_SOURCE_V2_TABLE error class
+displayTitle: CANNOT_CREATE_DATA_SOURCE_V2_TABLE error class
+license: |
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+---
+
+[SQLSTATE: 42KDE](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation)
+
+Failed to create data source V2 table:
+
+This error class has the following derived error classes:
+
+## EXTERNAL_METADATA_UNSUPPORTED
+
+provider '`<provider>`' does not support external metadata but a schema is provided. Please remove the schema when creating the table.
+
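For context, this error is raised from `V2SessionCatalog.createTable` (see the changes below) when a CREATE TABLE statement supplies a column list for a DSv2 provider whose `TableProvider.supportsExternalMetadata()` returns false (the interface default). A minimal sketch of the two cases, assuming a hypothetical provider class `com.example.NoExternalMetadataSource` on the classpath:

```scala
// The provider keeps the default supportsExternalMetadata() == false, so the
// explicit column list cannot be honored and the statement fails with
// CANNOT_CREATE_DATA_SOURCE_V2_TABLE.EXTERNAL_METADATA_UNSUPPORTED.
spark.sql(
  """CREATE TABLE t (id BIGINT, data STRING)
    |USING com.example.NoExternalMetadataSource
    |OPTIONS (path '/tmp/example')""".stripMargin)

// Omitting the column list avoids the error: the table is stored without a
// schema, and the provider's inferSchema/inferPartitioning are called when
// the table is loaded.
spark.sql(
  """CREATE TABLE t
    |USING com.example.NoExternalMetadataSource
    |OPTIONS (path '/tmp/example')""".stripMargin)
```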
DataSourceV2Relation.scala:

@@ -21,7 +21,7 @@ import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, NamedRelat
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference, Expression, SortOrder}
 import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, ExposesMetadataColumns, Histogram, HistogramBin, LeafNode, LogicalPlan, Statistics}
 import org.apache.spark.sql.catalyst.types.DataTypeUtils.toAttributes
-import org.apache.spark.sql.catalyst.util.{truncatedString, CharVarcharUtils}
+import org.apache.spark.sql.catalyst.util.{quoteIdentifier, truncatedString, CharVarcharUtils}
 import org.apache.spark.sql.connector.catalog.{CatalogPlugin, FunctionCatalog, Identifier, SupportsMetadataColumns, Table, TableCapability}
 import org.apache.spark.sql.connector.read.{Scan, Statistics => V2Statistics, SupportsReportStatistics}
 import org.apache.spark.sql.connector.read.streaming.{Offset, SparkDataStream}

@@ -61,7 +61,14 @@ case class DataSourceV2Relation(
       Nil
   }

-  override def name: String = table.name()
+  override def name: String = {
+    import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+    (catalog, identifier) match {
+      case (Some(cat), Some(ident)) => s"${quoteIdentifier(cat.name())}.${ident.quoted}"
+      case (None, Some(ident)) => ident.quoted
+      case _ => table.name()
+    }
+  }

   override def skipSchemaResolution: Boolean = table.supports(TableCapability.ACCEPT_ANY_SCHEMA)

@@ -127,7 +134,7 @@ case class DataSourceV2ScanRelation(
     keyGroupedPartitioning: Option[Seq[Expression]] = None,
     ordering: Option[Seq[SortOrder]] = None) extends LeafNode with NamedRelation {

-  override def name: String = relation.table.name()
+  override def name: String = relation.name

   override def simpleString(maxFields: Int): String = {
     s"RelationV2${truncatedString(output, "[", ", ", "]", maxFields)} $name"
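A small illustrative check of the fallback branch (not part of the PR; assumes some `Table` instance `table` is in scope):

```scala
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation

// A relation created without a catalog or identifier (for example a purely
// path-based read) still reports the table's own name, so such plans are
// unchanged by this PR.
val pathBasedRelation = DataSourceV2Relation.create(table, None, None)
assert(pathBasedRelation.name == table.name())
```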
V2SessionCatalog.scala:

@@ -23,11 +23,12 @@ import java.util
 import scala.collection.mutable
 import scala.jdk.CollectionConverters._

+import org.apache.spark.SparkUnsupportedOperationException
 import org.apache.spark.sql.catalyst.{FunctionIdentifier, SQLConfHelper, TableIdentifier}
 import org.apache.spark.sql.catalyst.analysis.{NoSuchDatabaseException, NoSuchTableException, TableAlreadyExistsException}
 import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, CatalogTable, CatalogTableType, CatalogUtils, ClusterBySpec, SessionCatalog}
 import org.apache.spark.sql.catalyst.util.TypeUtils._
-import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogV2Util, Column, FunctionCatalog, Identifier, NamespaceChange, SupportsNamespaces, Table, TableCatalog, TableCatalogCapability, TableChange, V1Table}
+import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogV2Util, Column, FunctionCatalog, Identifier, NamespaceChange, SupportsCatalogOptions, SupportsNamespaces, Table, TableCatalog, TableCatalogCapability, TableChange, V1Table}
 import org.apache.spark.sql.connector.catalog.NamespaceChange.RemoveProperty
 import org.apache.spark.sql.connector.catalog.functions.UnboundFunction
 import org.apache.spark.sql.connector.expressions.Transform
@@ -73,7 +74,45 @@ class V2SessionCatalog(catalog: SessionCatalog)

   override def loadTable(ident: Identifier): Table = {
     try {
-      V1Table(catalog.getTableMetadata(ident.asTableIdentifier))
+      val table = catalog.getTableMetadata(ident.asTableIdentifier)
+      if (table.provider.isDefined) {
+        DataSourceV2Utils.getTableProvider(table.provider.get, conf) match {
+          case Some(provider) =>
+            // Get the table properties during creation and append the path option
+            // to the properties.
+            val tableProperties = table.properties
+            val properties = tableProperties ++
+              table.storage.locationUri.map("path" -> CatalogUtils.URIToString(_))
+            val dsOptions = new CaseInsensitiveStringMap(properties.asJava)
+
+            provider match {
+              case _: SupportsCatalogOptions =>
+                // Currently, loadTable cannot support V2 provider that implements the
+                // SupportsCatalogOptions. Keep the behavior as before.
+                V1Table(table)
+
+              case _ =>
+                // If the source accepts external table metadata, we can pass the schema and
+                // partitioning information stored in Hive to `getTable` to avoid expensive
+                // schema/partitioning inference.
+                if (provider.supportsExternalMetadata()) {
+                  provider.getTable(
+                    table.schema,
+                    getV2Partitioning(table),
+                    dsOptions.asCaseSensitiveMap())
+                } else {
+                  provider.getTable(
+                    provider.inferSchema(dsOptions),
+                    provider.inferPartitioning(dsOptions),
+                    dsOptions.asCaseSensitiveMap())
+                }
+            }
+          case _ =>
+            V1Table(table)
+        }
+      } else {
+        V1Table(table)
+      }
     } catch {
       case _: NoSuchDatabaseException =>
         throw QueryCompilationErrors.noSuchTableError(ident)
@@ -96,6 +135,16 @@ class V2SessionCatalog(catalog: SessionCatalog)
       throw QueryCompilationErrors.timeTravelUnsupportedError(toSQLId(nameParts))
   }

+  private def getV2Partitioning(table: CatalogTable): Array[Transform] = {
+    import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+    val v2Partitioning = table.partitionColumnNames.asTransforms
+    val v2Bucketing = table.bucketSpec.map(
+      spec => Array(spec.asTransform)).getOrElse(Array.empty)
+    val v2Clustering = table.clusterBySpec.map(
+      spec => Array(spec.asTransform)).getOrElse(Array.empty)
+    v2Partitioning ++ v2Bucketing ++ v2Clustering
+  }
+
   override def invalidateTable(ident: Identifier): Unit = {
     catalog.refreshTable(ident.asTableIdentifier)
   }
@@ -115,9 +164,44 @@ class V2SessionCatalog(catalog: SessionCatalog)
       partitions: Array[Transform],
       properties: util.Map[String, String]): Table = {
     import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.TransformHelper
-    val (partitionColumns, maybeBucketSpec, maybeClusterBySpec) =
-      partitions.toImmutableArraySeq.convertTransforms
     val provider = properties.getOrDefault(TableCatalog.PROP_PROVIDER, conf.defaultDataSourceName)
+
+    val (newSchema, newPartitions) = DataSourceV2Utils.getTableProvider(provider, conf) match {
+      // If the provider does not support external metadata, users should not be allowed to
+      // specify custom schema when creating the data source table, since the schema will not
+      // be used when loading the table.
+      case Some(p) if !p.supportsExternalMetadata() =>
+        if (schema.nonEmpty) {
+          throw new SparkUnsupportedOperationException(
+            errorClass = "CANNOT_CREATE_DATA_SOURCE_V2_TABLE.EXTERNAL_METADATA_UNSUPPORTED",
+            messageParameters = Map("provider" -> provider))
+        }
+        // V2CreateTablePlan does not allow non-empty partitions when schema is empty. This
+        // is checked in `PreProcessTableCreation` rule.
+        assert(partitions.isEmpty,
+          s"Partitions should be empty when the schema is empty: ${partitions.mkString(", ")}")
+        (schema, partitions)
+
+      case Some(tableProvider) =>
+        assert(tableProvider.supportsExternalMetadata())
+        lazy val dsOptions = new CaseInsensitiveStringMap(properties)
+        if (schema.isEmpty) {
+          assert(partitions.isEmpty,
+            s"Partitions should be empty when the schema is empty: ${partitions.mkString(", ")}")
+          // Infer the schema and partitions and store them in the catalog.
+          (tableProvider.inferSchema(dsOptions), tableProvider.inferPartitioning(dsOptions))
+        } else if (partitions.isEmpty) {
+          (schema, tableProvider.inferPartitioning(dsOptions))
+        } else {
+          (schema, partitions)
+        }
+
+      case _ =>
+        (schema, partitions)
+    }
+
+    val (partitionColumns, maybeBucketSpec, maybeClusterBySpec) =
+      newPartitions.toImmutableArraySeq.convertTransforms
     val tableProperties = properties.asScala
     val location = Option(properties.get(TableCatalog.PROP_LOCATION))
     val storage = DataSource.buildStorageFormatFromOptions(toOptions(tableProperties.toMap))

Review comments on the `case _ => (schema, partitions)` fallback:

Contributor: shall we fail here if it's not a valid data source?

Contributor: maybe we can do it later. It's the current behavior that allows any table provider.
@@ -133,13 +217,13 @@ class V2SessionCatalog(catalog: SessionCatalog)
       identifier = ident.asTableIdentifier,
       tableType = tableType,
       storage = storage,
-      schema = schema,
+      schema = newSchema,
       provider = Some(provider),
       partitionColumnNames = partitionColumns,
       bucketSpec = maybeBucketSpec,
       properties = tableProperties.toMap ++
         maybeClusterBySpec.map(
-          clusterBySpec => ClusterBySpec.toProperty(schema, clusterBySpec, conf.resolver)),
+          clusterBySpec => ClusterBySpec.toProperty(newSchema, clusterBySpec, conf.resolver)),
       tracksPartitionsInCatalog = conf.manageFilesourcePartitions,
       comment = Option(properties.get(TableCatalog.PROP_COMMENT)))
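To make the createTable/loadTable flow above concrete, here is a minimal sketch of a DSv2 provider that opts into external metadata. The class and package names are hypothetical, and the returned table exposes no read/write capabilities; it only illustrates the metadata handshake.

```scala
package com.example

import java.util

import org.apache.spark.sql.connector.catalog.{Table, TableCapability, TableProvider}
import org.apache.spark.sql.connector.expressions.Transform
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap

class ExampleProvider extends TableProvider {

  // Opting in lets users put a column list in CREATE TABLE; V2SessionCatalog
  // stores that schema in the metastore and passes it back to getTable on load
  // instead of calling inferSchema/inferPartitioning again.
  override def supportsExternalMetadata(): Boolean = true

  // Used when no schema is given in the DDL, and on load for providers that
  // do not support external metadata.
  override def inferSchema(options: CaseInsensitiveStringMap): StructType =
    new StructType().add("id", "long").add("data", "string")

  override def getTable(
      tableSchema: StructType,
      partitioning: Array[Transform],
      properties: util.Map[String, String]): Table = new Table {
    override def name(): String = "example"
    override def schema(): StructType = tableSchema
    override def capabilities(): util.Set[TableCapability] =
      util.Collections.emptySet[TableCapability]()
  }
}
```

With such a provider, both `CREATE TABLE t (id BIGINT, data STRING) USING com.example.ExampleProvider` and `CREATE TABLE t USING com.example.ExampleProvider` succeed: the first stores the user-supplied schema, the second stores the inferred one.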
DataSourceV2SQLSuite.scala:

@@ -37,6 +37,7 @@ import org.apache.spark.sql.connector.catalog.{Column => ColumnV2, _}
 import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME
 import org.apache.spark.sql.connector.catalog.CatalogV2Util.withDefaultOwnership
 import org.apache.spark.sql.connector.expressions.LiteralValue
+import org.apache.spark.sql.connector.read.{InputPartition, ScanBuilder}
 import org.apache.spark.sql.errors.QueryErrorsBase
 import org.apache.spark.sql.execution.FilterExec
 import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
@@ -1120,7 +1121,7 @@ class DataSourceV2SQLSuiteV1Filter
       },
       errorClass = "INSERT_COLUMN_ARITY_MISMATCH.NOT_ENOUGH_DATA_COLUMNS",
       parameters = Map(
-        "tableName" -> "`default`.`tbl`",
+        "tableName" -> "`spark_catalog`.`default`.`tbl`",
         "tableColumns" -> "`id`, `data`",
         "dataColumns" -> "`col1`"
       )

@@ -1155,7 +1156,7 @@ class DataSourceV2SQLSuiteV1Filter
       },
       errorClass = "INSERT_COLUMN_ARITY_MISMATCH.NOT_ENOUGH_DATA_COLUMNS",
       parameters = Map(
-        "tableName" -> "`default`.`tbl`",
+        "tableName" -> "`spark_catalog`.`default`.`tbl`",
         "tableColumns" -> "`id`, `data`",
         "dataColumns" -> "`col1`"
       )

@@ -1191,7 +1192,7 @@ class DataSourceV2SQLSuiteV1Filter
       },
       errorClass = "INSERT_COLUMN_ARITY_MISMATCH.NOT_ENOUGH_DATA_COLUMNS",
       parameters = Map(
-        "tableName" -> "`default`.`tbl`",
+        "tableName" -> "`spark_catalog`.`default`.`tbl`",
         "tableColumns" -> "`id`, `data`, `data2`",
         "dataColumns" -> "`col1`, `col2`"
       )
@@ -3366,8 +3367,21 @@ class DataSourceV2SQLSuiteV2Filter extends DataSourceV2SQLSuite {

 /** Used as a V2 DataSource for V2SessionCatalog DDL */
 class FakeV2Provider extends SimpleTableProvider {
+
+  override def supportsExternalMetadata(): Boolean = true
+
+  class MyScanBuilder extends SimpleScanBuilder {
+    override def planInputPartitions(): Array[InputPartition] = {
+      Array(RangeInputPartition(0, 5), RangeInputPartition(5, 10))
+    }
+  }
+
   override def getTable(options: CaseInsensitiveStringMap): Table = {
-    throw new UnsupportedOperationException("Unnecessary for DDL tests")
+    new SimpleBatchTable {
+      override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = {
+        new MyScanBuilder()
+      }
+    }
   }
 }
Review comment on the new error class: I can't find other errors that mention data source v2. I think it's a developer thing and we should not expose it to end users via error message. How about just CANNOT_CREATE_TABLE?