[HUDI-6212] Hudi Spark 3.0.x integration #8714
Changes from all commits
HoodieSpark3CatalystExpressionUtils.scala

```diff
@@ -17,16 +17,21 @@
 package org.apache.spark.sql
 
-import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression, Predicate, PredicateHelper}
-import org.apache.spark.sql.execution.datasources.DataSourceStrategy
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression}
 
-trait HoodieSpark3CatalystExpressionUtils extends HoodieCatalystExpressionUtils
-  with PredicateHelper {
+abstract class HoodieSpark3CatalystExpressionUtils extends HoodieCatalystExpressionUtils {
 
-  override def normalizeExprs(exprs: Seq[Expression], attributes: Seq[Attribute]): Seq[Expression] =
-    DataSourceStrategy.normalizeExprs(exprs, attributes)
+  /**
+   * The attribute name may differ from the one in the schema if the query analyzer
+   * is case insensitive. We should change attribute names to match the ones in the schema,
+   * so we do not need to worry about case sensitivity anymore
+   */
+  def normalizeExprs(exprs: Seq[Expression], attributes: Seq[Attribute]): Seq[Expression]
 
-  override def extractPredicatesWithinOutputSet(condition: Expression,
-                                                outputSet: AttributeSet): Option[Expression] =
-    super[PredicateHelper].extractPredicatesWithinOutputSet(condition, outputSet)
+  /**
+   * Returns a filter that its reference is a subset of `outputSet` and it contains the maximum
+   * constraints from `condition`. This is used for predicate push-down
+   * When there is no such filter, `None` is returned.
+   */
+  def extractPredicatesWithinOutputSet(condition: Expression,
+                                       outputSet: AttributeSet): Option[Expression]
 }
```
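The shared Spark 3 module now only declares these two methods; each Spark-version-specific module has to provide the implementations, presumably because the Catalyst helpers involved are not identical across Spark 3.x minor releases. A minimal sketch of what such an implementation could look like, reusing the code removed above and assuming a Spark version whose `PredicateHelper` exposes `extractPredicatesWithinOutputSet` and whose `DataSourceStrategy` exposes `normalizeExprs`; the object name `HoodieSparkXxCatalystExpressionUtils` is hypothetical:

```scala
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression, PredicateHelper}
import org.apache.spark.sql.execution.datasources.DataSourceStrategy

// Hypothetical version-specific implementation: delegates to the Catalyst
// helpers that the shared module can no longer reference directly.
object HoodieSparkXxCatalystExpressionUtils extends HoodieSpark3CatalystExpressionUtils
  with PredicateHelper {

  // Rewrites attribute names in `exprs` to match the case used in `attributes`,
  // so downstream planning does not have to deal with case sensitivity.
  override def normalizeExprs(exprs: Seq[Expression], attributes: Seq[Attribute]): Seq[Expression] =
    DataSourceStrategy.normalizeExprs(exprs, attributes)

  // Extracts the largest sub-predicate of `condition` that only references
  // attributes in `outputSet`, for predicate push-down.
  override def extractPredicatesWithinOutputSet(condition: Expression,
                                                outputSet: AttributeSet): Option[Expression] =
    super[PredicateHelper].extractPredicatesWithinOutputSet(condition, outputSet)
}
```

Keeping only abstract declarations in the common module means it no longer compiles against Catalyst APIs whose presence or shape may vary between minor releases.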
BaseSpark3Adapter.scala

```diff
@@ -25,22 +25,15 @@ import org.apache.hudi.spark3.internal.ReflectUtil
 import org.apache.hudi.{AvroConversionUtils, DefaultSource, Spark3RowSerDe}
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.avro.{HoodieAvroSchemaConverters, HoodieSparkAvroSchemaConverters}
-import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases
-import org.apache.spark.sql.catalyst.catalog.CatalogTable
 import org.apache.spark.sql.catalyst.encoders.RowEncoder
 import org.apache.spark.sql.catalyst.expressions.{Expression, InterpretedPredicate, Predicate}
-import org.apache.spark.sql.catalyst.planning.PhysicalOperation
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.util.DateFormatter
-import org.apache.spark.sql.connector.catalog.V2TableWithV1Fallback
 import org.apache.spark.sql.execution.datasources._
-import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
 import org.apache.spark.sql.hudi.SparkAdapter
 import org.apache.spark.sql.sources.BaseRelation
-import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.{HoodieSpark3CatalogUtils, SQLContext, SparkSession}
+import org.apache.spark.sql.types.StructType
 import org.apache.spark.storage.StorageLevel
-import org.apache.spark.storage.StorageLevel._
 
 import java.time.ZoneId
 import java.util.TimeZone
```
```diff
@@ -63,20 +56,6 @@ abstract class BaseSpark3Adapter extends SparkAdapter with Logging {
     new Spark3RowSerDe(encoder)
   }
 
-  override def resolveHoodieTable(plan: LogicalPlan): Option[CatalogTable] = {
-    super.resolveHoodieTable(plan).orElse {
-      EliminateSubqueryAliases(plan) match {
-        // First, we need to weed out unresolved plans
-        case plan if !plan.resolved => None
-        // NOTE: When resolving Hudi table we allow [[Filter]]s and [[Project]]s be applied
-        // on top of it
-        case PhysicalOperation(_, _, DataSourceV2Relation(v2: V2TableWithV1Fallback, _, _, _, _)) if isHoodieTable(v2.v1Table) =>
-          Some(v2.v1Table)
-        case _ => None
-      }
-    }
-  }
-
   override def getAvroSchemaConverters: HoodieAvroSchemaConverters = HoodieSparkAvroSchemaConverters
 
   override def getSparkParsePartitionUtil: SparkParsePartitionUtil = Spark3ParsePartitionUtil
```
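The removed `resolveHoodieTable` override pattern-matches on `V2TableWithV1Fallback` and on a five-argument `DataSourceV2Relation`, which presumably are not available in this shape on Spark 3.0.x, so the logic can no longer live in the shared base adapter. A sketch of how a version-specific adapter might retain it, with a hypothetical class name and the body taken from the removal above:

```scala
import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.planning.PhysicalOperation
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.connector.catalog.V2TableWithV1Fallback
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation

// Hypothetical adapter for Spark versions that ship V2TableWithV1Fallback;
// the method body is the implementation removed from BaseSpark3Adapter above.
abstract class HypotheticalSparkXxAdapter extends BaseSpark3Adapter {

  override def resolveHoodieTable(plan: LogicalPlan): Option[CatalogTable] = {
    super.resolveHoodieTable(plan).orElse {
      EliminateSubqueryAliases(plan) match {
        // First, weed out unresolved plans
        case plan if !plan.resolved => None
        // Filters and Projects are allowed on top of the relation
        // while resolving the Hudi table
        case PhysicalOperation(_, _, DataSourceV2Relation(v2: V2TableWithV1Fallback, _, _, _, _))
          if isHoodieTable(v2.v1Table) => Some(v2.v1Table)
        case _ => None
      }
    }
  }
}
```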
```diff
@@ -108,23 +87,5 @@ abstract class BaseSpark3Adapter extends SparkAdapter with Logging {
     DefaultSource.createRelation(sqlContext, metaClient, dataSchema, globPaths, parameters.asScala.toMap)
   }
 
-  /**
-   * Converts instance of [[StorageLevel]] to a corresponding string
-   */
-  override def convertStorageLevelToString(level: StorageLevel): String = level match {
-    case NONE => "NONE"
-    case DISK_ONLY => "DISK_ONLY"
-    case DISK_ONLY_2 => "DISK_ONLY_2"
-    case DISK_ONLY_3 => "DISK_ONLY_3"
-    case MEMORY_ONLY => "MEMORY_ONLY"
-    case MEMORY_ONLY_2 => "MEMORY_ONLY_2"
-    case MEMORY_ONLY_SER => "MEMORY_ONLY_SER"
-    case MEMORY_ONLY_SER_2 => "MEMORY_ONLY_SER_2"
-    case MEMORY_AND_DISK => "MEMORY_AND_DISK"
-    case MEMORY_AND_DISK_2 => "MEMORY_AND_DISK_2"
-    case MEMORY_AND_DISK_SER => "MEMORY_AND_DISK_SER"
-    case MEMORY_AND_DISK_SER_2 => "MEMORY_AND_DISK_SER_2"
-    case OFF_HEAP => "OFF_HEAP"
-    case _ => throw new IllegalArgumentException(s"Invalid StorageLevel: $level")
-  }
+  override def convertStorageLevelToString(level: StorageLevel): String
 }
```
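`convertStorageLevelToString` becomes abstract as well, presumably because the set of `StorageLevel` constants differs between Spark releases. A sketch of a version-specific implementation reusing the removed mapping, under the assumption that every constant listed exists in the targeted Spark version (the class name is hypothetical):

```scala
import org.apache.spark.storage.StorageLevel
import org.apache.spark.storage.StorageLevel._

// Hypothetical version-specific implementation of the now-abstract method,
// reusing the mapping removed from BaseSpark3Adapter above; it assumes every
// constant below exists in the Spark version this adapter targets.
abstract class HypotheticalSparkXyAdapter extends BaseSpark3Adapter {

  override def convertStorageLevelToString(level: StorageLevel): String = level match {
    case NONE => "NONE"
    case DISK_ONLY => "DISK_ONLY"
    case DISK_ONLY_2 => "DISK_ONLY_2"
    case DISK_ONLY_3 => "DISK_ONLY_3"
    case MEMORY_ONLY => "MEMORY_ONLY"
    case MEMORY_ONLY_2 => "MEMORY_ONLY_2"
    case MEMORY_ONLY_SER => "MEMORY_ONLY_SER"
    case MEMORY_ONLY_SER_2 => "MEMORY_ONLY_SER_2"
    case MEMORY_AND_DISK => "MEMORY_AND_DISK"
    case MEMORY_AND_DISK_2 => "MEMORY_AND_DISK_2"
    case MEMORY_AND_DISK_SER => "MEMORY_AND_DISK_SER"
    case MEMORY_AND_DISK_SER_2 => "MEMORY_AND_DISK_SER_2"
    case OFF_HEAP => "OFF_HEAP"
    case _ => throw new IllegalArgumentException(s"Invalid StorageLevel: $level")
  }
}
```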