[HUDI-3047] Basic Implementation of Spark Datasource V2 #4350
Changes from all commits: e9d8eaf, b08c502, 958af8c, a3aae4e, ff026f4, b96b821, dc2f8a9, 3d947ae, 54b205a, 6e90594
Diff for the `SparkAdapter` trait (package `org.apache.spark.sql.hudi`):

```diff
@@ -20,16 +20,19 @@ package org.apache.spark.sql.hudi

 import org.apache.hudi.client.utils.SparkRowSerDe
 import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
+import org.apache.spark.sql.catalyst.catalog.CatalogTable
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.catalyst.expressions.Expression
 import org.apache.spark.sql.catalyst.parser.ParserInterface
 import org.apache.spark.sql.catalyst.plans.JoinType
-import org.apache.spark.sql.catalyst.plans.logical.{Join, LogicalPlan}
+import org.apache.spark.sql.catalyst.plans.logical.{Join, LogicalPlan, SubqueryAlias}
 import org.apache.spark.sql.catalyst.{AliasIdentifier, TableIdentifier}
-import org.apache.spark.sql.execution.datasources.SparkParsePartitionUtil
+import org.apache.spark.sql.execution.datasources.{LogicalRelation, SparkParsePartitionUtil}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.{Row, SparkSession}

+import java.util.Locale
+
 /**
  * An interface to adapter the difference between spark2 and spark3
  * in some spark related class.
@@ -44,12 +47,12 @@ trait SparkAdapter extends Serializable {
   /**
    * Convert a AliasIdentifier to TableIdentifier.
    */
-  def toTableIdentify(aliasId: AliasIdentifier): TableIdentifier
+  def toTableIdentifier(aliasId: AliasIdentifier): TableIdentifier

   /**
    * Convert a UnresolvedRelation to TableIdentifier.
    */
-  def toTableIdentify(relation: UnresolvedRelation): TableIdentifier
+  def toTableIdentifier(relation: UnresolvedRelation): TableIdentifier

   /**
    * Create Join logical plan.
@@ -92,4 +95,31 @@ trait SparkAdapter extends Serializable {
    * ParserInterface#parseMultipartIdentifier is supported since spark3, for spark2 this should not be called.
    */
   def parseMultipartIdentifier(parser: ParserInterface, sqlText: String): Seq[String]
+
+  def isHoodieTable(table: LogicalPlan, spark: SparkSession): Boolean = {
```
Review thread on the new `isHoodieTable` overload:

**Contributor:** Is there any difference from `hoodieSqlCommonUtils.isHoodieTable`? I see we sometimes use `adapter.isHoodieTable` and sometimes `hoodieSqlCommonUtils.isHoodieTable`.

**Contributor (Author):** In fact, the `hoodieSqlCommonUtils.isHoodieTable` method is used in the v1 codebase to judge whether a table is a Hudi table, while `adapter.isHoodieTable` makes that judgment in the v2 codebase. Changing the name makes the distinction easier to understand.

**Contributor:** Thanks for your reply, and please correct me if I'm wrong. Could we move the method implementation out of `SparkAdapter`, so that `Spark2Adapter` judges whether a table is a Hudi table via the v1 codebase and `Spark3Adapter` judges it via the v2 codebase? That way, `hoodieSqlCommonUtils.isHoodieTable` could simply be removed. Looking forward to this PR being merged as soon as possible; this is excellent work, as we can support many other features with DSV2 based on it. :)
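A minimal sketch of the refactor the reviewer suggests. All class names here are hypothetical stand-ins, not Hudi's actual adapters: the trait keeps only the contract, and each Spark-version adapter supplies its own check.

```scala
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.connector.catalog.Table

// Hypothetical names, for illustration only; not Hudi's actual classes.
trait AdapterSketch {
  // Each Spark-version adapter supplies its own notion of "is a Hudi table".
  def isHoodieTable(table: CatalogTable): Boolean
}

// Spark 2 / v1 code path: decide from the session catalog entry's provider,
// which is what hoodieSqlCommonUtils.isHoodieTable inspects today.
class Spark2AdapterSketch extends AdapterSketch {
  override def isHoodieTable(table: CatalogTable): Boolean =
    table.provider.exists(_.equalsIgnoreCase("hudi"))
}

// Spark 3 / v2 code path: the same decision can also be made from a
// connector Table's properties ("provider" is the key the v2 session
// catalog records for tables created with USING).
class Spark3AdapterSketch extends Spark2AdapterSketch {
  def isHoodieTable(table: Table): Boolean =
    "hudi".equalsIgnoreCase(table.properties.getOrDefault("provider", ""))
}
```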
The hunk continues with the body of `isHoodieTable` and its helpers:

```diff
+    tripAlias(table) match {
+      case LogicalRelation(_, _, Some(tbl), _) => isHoodieTable(tbl)
+      case relation: UnresolvedRelation =>
+        isHoodieTable(toTableIdentifier(relation), spark)
+      case _ => false
+    }
+  }
+
+  def tripAlias(plan: LogicalPlan): LogicalPlan = {
+    plan match {
+      case SubqueryAlias(_, relation: LogicalPlan) =>
+        tripAlias(relation)
+      case other =>
+        other
+    }
+  }
+
+  def isHoodieTable(table: CatalogTable): Boolean = {
+    table.provider.map(_.toLowerCase(Locale.ROOT)).orNull == "hudi"
+  }
+
+  def isHoodieTable(tableId: TableIdentifier, spark: SparkSession): Boolean = {
+    val table = spark.sessionState.catalog.getTableMetadata(tableId)
+    isHoodieTable(table)
+  }
 }
```
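To make the alias-stripping concrete, here is a small self-contained sketch against the Spark 3.x catalyst API (the table name `t` is made up): nested `SubqueryAlias` nodes are peeled off recursively, which is what lets `isHoodieTable` see through aliased relations such as `FROM tbl AS a`.

```scala
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias}

object TripAliasDemo extends App {
  // Same recursion as the tripAlias method added in the diff above.
  def tripAlias(plan: LogicalPlan): LogicalPlan = plan match {
    case SubqueryAlias(_, child) => tripAlias(child)
    case other => other
  }

  // A doubly-aliased relation, as nested aliased subqueries would produce.
  val plan: LogicalPlan =
    SubqueryAlias("outer", SubqueryAlias("inner", UnresolvedRelation(TableIdentifier("t"))))

  // Both alias layers are stripped, exposing the underlying relation,
  // which isHoodieTable can then resolve through the session catalog.
  assert(tripAlias(plan).isInstanceOf[UnresolvedRelation])
  println(tripAlias(plan)) // prints the bare UnresolvedRelation node
}
```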
Diff for `DefaultSource`:

```diff
@@ -177,7 +177,7 @@ class DefaultSource extends RelationProvider
       outputMode)
   }

-  override def shortName(): String = "hudi"
+  override def shortName(): String = "hudi_v1"
```
**Member:** Wouldn't this cause every job out there to need upgrading? Not sure we can afford that. I would also like to clearly understand whether the new v2 implementation will support ALL the existing functionality and be a drop-in replacement for the current v1 implementation. I think it's crucial to get aligned on this before we proceed further.
The hunk continues:

```diff

   private def getBaseFileOnlyView(useHoodieFileIndex: Boolean,
                                   sqlContext: SQLContext,
```
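For context on the concern: the string returned by `shortName()` is what users pass to `.format(...)`, so existing jobs written like the sketch below would stop resolving to this source once it answers only to `hudi_v1`. The sketch assumes the Hudi Spark bundle is on the classpath and a Hudi table exists at the (hypothetical) base path.

```scala
import org.apache.spark.sql.SparkSession

object ShortNameDemo extends App {
  val spark = SparkSession.builder()
    .master("local[1]")
    .appName("short-name-demo")
    .getOrCreate()

  // Spark resolves "hudi" against each registered source's shortName().
  // After the rename, reaching the old v1 DefaultSource would require
  // .format("hudi_v1") instead, breaking jobs written like this one.
  val df = spark.read
    .format("hudi")
    .load("/tmp/hudi/trips") // hypothetical base path

  df.printSchema()
  spark.stop()
}
```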
Review thread on the commented-out workflow:

@xushiyan I find it very hard to keep compatibility between Spark 3.2.0 and Spark 3.0.x/3.1.x (there is no V1Write in Spark 3.0.x and 3.1.x) after we upgrade the Spark version to 3.2.0, so I commented out the workflow.

@leesf We introduced the build profiles spark3.0.x and spark3.1.x mostly due to Spark's own incompatibilities. I think we can make a rule here: to enable the v2 writer, users have to make sure they're on Spark 3.2+. Sounds good? In the future, we may gradually drop support for old Spark versions if the old Spark code deviates too far from the latest one.

Sounds good.
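A minimal sketch of the rule proposed above, with a hypothetical helper name: gate the v2 write path on the running Spark version so that Spark 3.0.x/3.1.x users keep the v1 writer.

```scala
import org.apache.spark.SPARK_VERSION

// Hypothetical gate, not Hudi's actual code: the v2 writer relies on
// V1Write, which only exists on Spark 3.2+.
object V2WriteGate {
  def supportsV2Write: Boolean = {
    val parts = SPARK_VERSION.split("\\.")
    val major = parts(0).toInt
    val minor = parts(1).takeWhile(_.isDigit).toInt
    major > 3 || (major == 3 && minor >= 2)
  }
}
```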