5 changes: 4 additions & 1 deletion .github/workflows/bot.yml
@@ -36,6 +36,10 @@ jobs:
sparkProfile: "spark3.2"
flinkProfile: "flink1.14"

- scalaProfile: "scala-2.12"
sparkProfile: "spark3.3"
flinkProfile: "flink1.14"

steps:
- uses: actions/checkout@v2
- name: Set up JDK 8
@@ -56,7 +60,6 @@ jobs:
SCALA_PROFILE: ${{ matrix.scalaProfile }}
SPARK_PROFILE: ${{ matrix.sparkProfile }}
FLINK_PROFILE: ${{ matrix.flinkProfile }}
if: ${{ !endsWith(env.SPARK_PROFILE, '3.2') }} # skip test spark 3.2 before hadoop upgrade to 3.x
run:
mvn test -Punit-tests -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -D"$FLINK_PROFILE" -DfailIfNoTests=false -pl hudi-examples/hudi-examples-flink,hudi-examples/hudi-examples-java,hudi-examples/hudi-examples-spark
- name: Spark SQL Test
@@ -130,6 +130,16 @@ public class HoodieStorageConfig extends HoodieConfig {
.defaultValue("TIMESTAMP_MICROS")
.withDocumentation("Sets spark.sql.parquet.outputTimestampType. Parquet timestamp type to use when Spark writes data to Parquet files.");

// SPARK-38094: Spark 3.3 checks whether this field is enabled; Hudi has to provide it, otherwise an NPE is thrown.
// Would ONLY be effective with Spark 3.3+.
// The default value is true, which is in accordance with Spark 3.3.
public static final ConfigProperty<String> PARQUET_FIELD_ID_WRITE_ENABLED = ConfigProperty
.key("hoodie.parquet.field_id.write.enabled")
.defaultValue("true")
.sinceVersion("0.12.0")
.withDocumentation("Would only be effective with Spark 3.3+. Sets spark.sql.parquet.fieldId.write.enabled. "
+ "If enabled, Spark will write out parquet native field ids that are stored inside StructField's metadata as parquet.field.id to parquet files.");

public static final ConfigProperty<String> HFILE_COMPRESSION_ALGORITHM_NAME = ConfigProperty
.key("hoodie.hfile.compression.algorithm")
.defaultValue("GZ")
@@ -337,6 +347,11 @@ public Builder parquetOutputTimestampType(String parquetOutputTimestampType) {
return this;
}

public Builder parquetFieldIdWrite(String parquetFieldIdWrite) {
storageConfig.setValue(PARQUET_FIELD_ID_WRITE_ENABLED, parquetFieldIdWrite);
return this;
}

public Builder hfileCompressionAlgorithm(String hfileCompressionAlgorithm) {
storageConfig.setValue(HFILE_COMPRESSION_ALGORITHM_NAME, hfileCompressionAlgorithm);
return this;
@@ -1677,6 +1677,10 @@ public String parquetOutputTimestampType() {
return getString(HoodieStorageConfig.PARQUET_OUTPUT_TIMESTAMP_TYPE);
}

public String parquetFieldIdWriteEnabled() {
return getString(HoodieStorageConfig.PARQUET_FIELD_ID_WRITE_ENABLED);
}

public Option<HoodieLogBlock.HoodieLogBlockType> getLogDataBlockFormat() {
return Option.ofNullable(getString(HoodieStorageConfig.LOGFILE_DATA_BLOCK_FORMAT))
.map(HoodieLogBlock.HoodieLogBlockType::fromId);
@@ -18,6 +18,7 @@

package org.apache.hudi.client.bootstrap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.AvroConversionUtils;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.avro.model.HoodieFileStatus;
@@ -71,11 +72,20 @@ protected Schema getBootstrapSourceSchema(HoodieEngineContext context, List<Pair
}

private static Schema getBootstrapSourceSchemaParquet(HoodieWriteConfig writeConfig, HoodieEngineContext context, Path filePath) {
MessageType parquetSchema = new ParquetUtils().readSchema(context.getHadoopConf().get(), filePath);
Configuration hadoopConf = context.getHadoopConf().get();
MessageType parquetSchema = new ParquetUtils().readSchema(hadoopConf, filePath);

Contributor Author: Change made according to SPARK-36935 (ParquetSchemaConverter change).

hadoopConf.set(
SQLConf.PARQUET_BINARY_AS_STRING().key(),
SQLConf.PARQUET_BINARY_AS_STRING().defaultValueString());
hadoopConf.set(
SQLConf.PARQUET_INT96_AS_TIMESTAMP().key(),
SQLConf.PARQUET_INT96_AS_TIMESTAMP().defaultValueString());
hadoopConf.set(
SQLConf.CASE_SENSITIVE().key(),
SQLConf.CASE_SENSITIVE().defaultValueString());
ParquetToSparkSchemaConverter converter = new ParquetToSparkSchemaConverter(hadoopConf);

ParquetToSparkSchemaConverter converter = new ParquetToSparkSchemaConverter(
Boolean.parseBoolean(SQLConf.PARQUET_BINARY_AS_STRING().defaultValueString()),
Boolean.parseBoolean(SQLConf.PARQUET_INT96_AS_TIMESTAMP().defaultValueString()));
StructType sparkSchema = converter.convert(parquetSchema);
String tableName = HoodieAvroUtils.sanitizeName(writeConfig.getTableName());
String structName = tableName + "_record";
@@ -50,6 +50,7 @@ public HoodieRowParquetWriteSupport(Configuration conf, StructType structType, O
Configuration hadoopConf = new Configuration(conf);
hadoopConf.set("spark.sql.parquet.writeLegacyFormat", writeConfig.parquetWriteLegacyFormatEnabled());
hadoopConf.set("spark.sql.parquet.outputTimestampType", writeConfig.parquetOutputTimestampType());
hadoopConf.set("spark.sql.parquet.fieldId.write.enabled", writeConfig.parquetFieldIdWriteEnabled());
this.hadoopConf = hadoopConf;
setSchema(structType, hadoopConf);
this.bloomFilter = bloomFilterOpt.orElse(null);
@@ -52,11 +52,13 @@ private[hudi] trait SparkVersionsSupport {
def isSpark3_0: Boolean = getSparkVersion.startsWith("3.0")
def isSpark3_1: Boolean = getSparkVersion.startsWith("3.1")
def isSpark3_2: Boolean = getSparkVersion.startsWith("3.2")
def isSpark3_3: Boolean = getSparkVersion.startsWith("3.3")

def gteqSpark3_1: Boolean = getSparkVersion >= "3.1"
def gteqSpark3_1_3: Boolean = getSparkVersion >= "3.1.3"
def gteqSpark3_2: Boolean = getSparkVersion >= "3.2"
def gteqSpark3_2_1: Boolean = getSparkVersion >= "3.2.1"
def gteqSpark3_3: Boolean = getSparkVersion >= "3.3"
}

object HoodieSparkUtils extends SparkAdapterSupport with SparkVersionsSupport {
@@ -27,7 +27,9 @@ import org.apache.spark.sql.hudi.SparkAdapter
trait SparkAdapterSupport {

lazy val sparkAdapter: SparkAdapter = {
val adapterClass = if (HoodieSparkUtils.isSpark3_2) {
val adapterClass = if (HoodieSparkUtils.isSpark3_3) {
"org.apache.spark.sql.adapter.Spark3_3Adapter"
} else if (HoodieSparkUtils.isSpark3_2) {
"org.apache.spark.sql.adapter.Spark3_2Adapter"
} else if (HoodieSparkUtils.isSpark3_0 || HoodieSparkUtils.isSpark3_1) {
"org.apache.spark.sql.adapter.Spark3_1Adapter"
@@ -24,17 +24,15 @@ import org.apache.spark.sql.avro.{HoodieAvroDeserializer, HoodieAvroSchemaConver
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.expressions.{Expression, InterpretedPredicate}
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, InterpretedPredicate}
import org.apache.spark.sql.catalyst.parser.ParserInterface
import org.apache.spark.sql.catalyst.plans.JoinType
import org.apache.spark.sql.catalyst.plans.logical.{Join, LogicalPlan, SubqueryAlias}
import org.apache.spark.sql.catalyst.{AliasIdentifier, TableIdentifier}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias}
import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan, SubqueryAlias}
import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
import org.apache.spark.sql.execution.datasources.{FilePartition, LogicalRelation, PartitionedFile, SparkParsePartitionUtil}
import org.apache.spark.sql.execution.datasources.{FilePartition, FileScanRDD, LogicalRelation, PartitionedFile, SparkParsePartitionUtil}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.types.{DataType, StructType}
import org.apache.spark.sql.{HoodieCatalystExpressionUtils, HoodieCatalystPlansUtils, Row, SparkSession}
import org.apache.spark.storage.StorageLevel

@@ -132,8 +130,8 @@ trait SparkAdapter extends Serializable {
}

/**
* Create instance of [[ParquetFileFormat]]
*/
* Create instance of [[ParquetFileFormat]]
*/
def createHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat]

/**
@@ -143,6 +141,38 @@
*/
def createInterpretedPredicate(e: Expression): InterpretedPredicate

/**
* Create instance of [[HoodieFileScanRDD]]
* SPARK-37273 FileScanRDD constructor changed in SPARK 3.3
*/
def createHoodieFileScanRDD(sparkSession: SparkSession,
readFunction: PartitionedFile => Iterator[InternalRow],
filePartitions: Seq[FilePartition],
readDataSchema: StructType,
metadataColumns: Seq[AttributeReference] = Seq.empty): FileScanRDD
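For orientation, a minimal sketch (not the PR's actual Spark3_3Adapter; the object name and its package placement are assumptions, chosen to mirror where Hudi keeps its own FileScanRDD subclass) of how this hook maps onto Spark 3.3's extended FileScanRDD constructor:

```scala
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.types.StructType

// Hedged sketch against Spark 3.3 only: SPARK-37273 extended the FileScanRDD constructor
// with the read schema and optional metadata columns, which is why the adapter hook above
// carries those two extra parameters.
object Spark33FileScanRddSketch {
  def build(spark: SparkSession,
            readFunction: PartitionedFile => Iterator[InternalRow],
            filePartitions: Seq[FilePartition],
            readDataSchema: StructType,
            metadataColumns: Seq[AttributeReference] = Seq.empty): FileScanRDD =
    new FileScanRDD(spark, readFunction, filePartitions, readDataSchema, metadataColumns)
}
```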

/**
* Resolve [[DeleteFromTable]]
* SPARK-38626 condition is no longer Option in Spark 3.3
*/
def resolveDeleteFromTable(deleteFromTable: Command,
resolveExpression: Expression => Expression): LogicalPlan

/**
* Extract condition in [[DeleteFromTable]]
* SPARK-38626 condition is no longer Option in Spark 3.3
*/
def extractCondition(deleteFromTable: Command): Expression
Contributor: nit: rename to extractDeleteCondition?
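
A minimal sketch of the Spark 3.3 side of this hook (object name hypothetical; the PR's real adapters may differ): in 3.3 the condition is already a plain Expression, whereas a pre-3.3 adapter would unwrap the Option[Expression].

```scala
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans.logical.{Command, DeleteFromTable}

// Compiles against Spark 3.3, where DeleteFromTable.condition is an Expression (SPARK-38626).
// A Spark 3.2 counterpart would instead return deleteFromTable.condition.orNull, since the
// field there is Option[Expression].
object Spark33DeleteConditionSketch {
  def extractCondition(deleteFromTable: Command): Expression =
    deleteFromTable.asInstanceOf[DeleteFromTable].condition
}
```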


/**
* Get parseQuery from ExtendedSqlParser, only for Spark 3.3+
*/
def getQueryParserFromExtendedSqlParser(session: SparkSession, delegate: ParserInterface,
sqlText: String): LogicalPlan = {
// unsupported by default
throw new UnsupportedOperationException(s"Unsupported parseQuery method in Spark earlier than Spark 3.3.0")
}

/**
* Converts instance of [[StorageLevel]] to a corresponding string
*/
@@ -22,6 +22,7 @@
import org.apache.flink.types.Row;
import org.apache.hudi.common.model.HoodieTableType;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.io.TempDir;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.EnumSource;
@@ -45,6 +46,7 @@ void beforeEach() {
@TempDir
File tempFile;

@Disabled
Contributor Author: Temporarily disabled the Flink quickstart test due to its instability.

@ParameterizedTest
@EnumSource(value = HoodieTableType.class)
void testHoodieFlinkQuickstart(HoodieTableType tableType) throws Exception {
6 changes: 6 additions & 0 deletions hudi-examples/hudi-examples-spark/pom.xml
@@ -190,6 +190,12 @@
<artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency>

<!-- Hadoop -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-auth</artifactId>
</dependency>

Contributor Author: SparkQuickStartTest would throw NoSuchMethodError without this.

Member: Good find. So can we now re-enable the Spark 3.2 quickstart test in the GH action? Check out bot.yml.

Contributor Author: Highly likely. Maybe we need a separate JIRA to track that?

<!-- Parquet -->
<dependency>
<groupId>org.apache.parquet</groupId>
17 changes: 12 additions & 5 deletions hudi-spark-datasource/README.md
@@ -21,17 +21,19 @@ This repo contains the code that integrate Hudi with Spark. The repo is split in

`hudi-spark`
`hudi-spark2`
`hudi-spark3`
`hudi-spark3.1.x`
`hudi-spark3.2.x`
`hudi-spark3.3.x`
`hudi-spark2-common`
`hudi-spark3-common`
`hudi-spark-common`

* hudi-spark is the module that contains the code shared by both the spark2 & spark3 versions; it also contains the antlr4
file that supports spark sql on spark 2.x versions.
* hudi-spark2 is the module that contains the code compatible with spark 2.x versions.
* hudi-spark3 is the module that contains the code compatible with spark 3.2.0 (and above) versions.
* hudi-spark3.1.x is the module that contains the code compatible with spark 3.1.x and spark 3.0.x versions.
* hudi-spark3.1.x is the module that contains the code compatible with spark 3.1.x and spark 3.0.x versions.
* hudi-spark3.2.x is the module that contains the code compatible with spark 3.2.x versions.
* hudi-spark3.3.x is the module that contains the code compatible with spark 3.3.x+ versions.
* hudi-spark2-common is the module that contains the code that would be reused between spark 2.x versions; right now the module
has no classes, since hudi only supports spark 2.4.4, and it acts as a placeholder when packaging the hudi-spark-bundle module.
* hudi-spark3-common is the module that contains the code that would be reused between spark 3.x versions.
@@ -50,7 +52,12 @@ has no class since hudi only supports spark 2.4.4 version, and it acts as the pl
| 3.1.2 | No |
| 3.2.0 | Yes |

### About upgrading Time Travel:
### To improve:
Spark 3.3 supports time travel syntax (illustrated below); see [SPARK-37219](https://issues.apache.org/jira/browse/SPARK-37219).
Once Spark 3.3 is released, the files in the following list will be removed:
* hudi-spark3's `HoodieSpark3_2ExtendedSqlAstBuilder.scala`, `HoodieSpark3_2ExtendedSqlParser.scala`, `TimeTravelRelation.scala`, `SqlBase.g4`, `HoodieSqlBase.g4`
* hudi-spark3.3.x's `HoodieSpark3_3ExtendedSqlAstBuilder.scala`, `HoodieSpark3_3ExtendedSqlParser.scala`, `TimeTravelRelation.scala`, `SqlBase.g4`, `HoodieSqlBase.g4`
Tracking Jira: [HUDI-4468](https://issues.apache.org/jira/browse/HUDI-4468)
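
A hedged sketch of the Spark 3.3 syntax in question, which is what makes the borrowed parser files above removable; the table name and timestamps are made-up example values:

```scala
import org.apache.spark.sql.SparkSession

// Assumes a Spark 3.3 session and an existing Hudi table registered as `hudi_trips`.
val spark = SparkSession.builder().appName("time-travel-sketch").getOrCreate()
spark.sql("SELECT * FROM hudi_trips TIMESTAMP AS OF '2022-07-20 10:00:00'").show()
spark.sql("SELECT * FROM hudi_trips TIMESTAMP AS OF '20220720100000'").show() // Hudi-style instant, assumed accepted
```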

Some other improvements are underway:
* Port borrowed classes from Spark 3.3 [HUDI-4467](https://issues.apache.org/jira/browse/HUDI-4467)

@@ -52,6 +52,8 @@ class BaseFileOnlyRelation(sqlContext: SQLContext,
globPaths: Seq[Path])
extends HoodieBaseRelation(sqlContext, metaClient, optParams, userSchema) with SparkAdapterSupport {

case class HoodieBaseFileSplit(filePartition: FilePartition) extends HoodieFileSplit

override type FileSplit = HoodieBaseFileSplit

// TODO(HUDI-3204) this is to override behavior (exclusively) for COW tables to always extract
@@ -97,7 +99,9 @@ class BaseFileOnlyRelation(sqlContext: SQLContext,
// back into the one expected by the caller
val projectedReader = projectReader(baseFileReader, requiredSchema.structTypeSchema)

new HoodieFileScanRDD(sparkSession, projectedReader.apply, fileSplits)
// SPARK-37273 FileScanRDD constructor changed in SPARK 3.3
sparkAdapter.createHoodieFileScanRDD(sparkSession, projectedReader.apply, fileSplits.map(_.filePartition), requiredSchema.structTypeSchema)
.asInstanceOf[HoodieUnsafeRDD]
Contributor Author: FileScanRDD API changed (SPARK-37273); we have to split HoodieFileScanRDD for different Spark versions.

}

protected def collectFileSplits(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[HoodieBaseFileSplit] = {
@@ -44,15 +44,24 @@ import scala.collection.mutable.ListBuffer
object HoodieAnalysis {
type RuleBuilder = SparkSession => Rule[LogicalPlan]

def customOptimizerRules: Seq[RuleBuilder] =
def customOptimizerRules: Seq[RuleBuilder] = {
if (HoodieSparkUtils.gteqSpark3_1) {
val nestedSchemaPruningClass = "org.apache.spark.sql.execution.datasources.NestedSchemaPruning"
val nestedSchemaPruningRule = ReflectionUtils.loadClass(nestedSchemaPruningClass).asInstanceOf[Rule[LogicalPlan]]
val nestedSchemaPruningClass =
if (HoodieSparkUtils.gteqSpark3_3) {
"org.apache.spark.sql.execution.datasources.Spark33NestedSchemaPruning"
} else if (HoodieSparkUtils.gteqSpark3_2) {
"org.apache.spark.sql.execution.datasources.Spark32NestedSchemaPruning"
} else {
// spark 3.1
"org.apache.spark.sql.execution.datasources.Spark31NestedSchemaPruning"
}

val nestedSchemaPruningRule = ReflectionUtils.loadClass(nestedSchemaPruningClass).asInstanceOf[Rule[LogicalPlan]]
Seq(_ => nestedSchemaPruningRule)
} else {
Seq.empty
}
}

def customResolutionRules: Seq[RuleBuilder] = {
val rules: ListBuffer[RuleBuilder] = ListBuffer(
@@ -74,18 +83,21 @@ object HoodieAnalysis {
val spark3ResolveReferences: RuleBuilder =
session => ReflectionUtils.loadClass(spark3ResolveReferencesClass, session).asInstanceOf[Rule[LogicalPlan]]

val spark32ResolveAlterTableCommandsClass = "org.apache.spark.sql.hudi.ResolveHudiAlterTableCommandSpark32"
val spark32ResolveAlterTableCommands: RuleBuilder =
session => ReflectionUtils.loadClass(spark32ResolveAlterTableCommandsClass, session).asInstanceOf[Rule[LogicalPlan]]
val resolveAlterTableCommandsClass =
if (HoodieSparkUtils.gteqSpark3_3)
"org.apache.spark.sql.hudi.Spark33ResolveHudiAlterTableCommand"
else "org.apache.spark.sql.hudi.Spark32ResolveHudiAlterTableCommand"
val resolveAlterTableCommands: RuleBuilder =
session => ReflectionUtils.loadClass(resolveAlterTableCommandsClass, session).asInstanceOf[Rule[LogicalPlan]]

// NOTE: PLEASE READ CAREFULLY
//
// It's critical for these rules to be applied in this order, so that the DataSource V2 to V1 fallback
// is performed prior to other rules being evaluated
rules ++= Seq(dataSourceV2ToV1Fallback, spark3Analysis, spark3ResolveReferences, spark32ResolveAlterTableCommands)
rules ++= Seq(dataSourceV2ToV1Fallback, spark3Analysis, spark3ResolveReferences, resolveAlterTableCommands)
Contributor Author: SPARK-38939 DropColumns syntax change.


} else if (HoodieSparkUtils.gteqSpark3_1) {
val spark31ResolveAlterTableCommandsClass = "org.apache.spark.sql.hudi.ResolveHudiAlterTableCommand312"
val spark31ResolveAlterTableCommandsClass = "org.apache.spark.sql.hudi.Spark312ResolveHudiAlterTableCommand"
val spark31ResolveAlterTableCommands: RuleBuilder =
session => ReflectionUtils.loadClass(spark31ResolveAlterTableCommandsClass, session).asInstanceOf[Rule[LogicalPlan]]

@@ -421,12 +433,10 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[Logi
UpdateTable(table, resolvedAssignments, resolvedCondition)

// Resolve Delete Table
case DeleteFromTable(table, condition)
case dft @ DeleteFromTable(table, condition)
if sparkAdapter.isHoodieTable(table, sparkSession) && table.resolved =>
// Resolve condition
val resolvedCondition = condition.map(resolveExpressionFrom(table)(_))
// Return the resolved DeleteTable
DeleteFromTable(table, resolvedCondition)
val resolveExpression = resolveExpressionFrom(table, None)_
Contributor: I'd suggest we keep the syntax as it was (with parentheses).

sparkAdapter.resolveDeleteFromTable(dft, resolveExpression)

// Append the meta field to the insert query to walk through the validate for the
// number of insert fields with the number of the target table fields.
@@ -21,6 +21,7 @@ import org.apache.hudi.SparkAdapterSupport
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable
import org.apache.spark.sql.catalyst.plans.logical.DeleteFromTable
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.hudi.HoodieSqlCommonUtils._
import org.apache.spark.sql.hudi.ProvidesHoodieConfig

@@ -36,9 +37,9 @@ case class DeleteHoodieTableCommand(deleteTable: DeleteFromTable) extends Hoodie

// Remove meta fields from the data frame
var df = removeMetaFields(Dataset.ofRows(sparkSession, table))
if (deleteTable.condition.isDefined) {
df = df.filter(Column(deleteTable.condition.get))
}
// SPARK-38626 DeleteFromTable.condition is changed from Option[Expression] to Expression in Spark 3.3
Contributor: nit: the comment can go into the Spark adapter implementation and is not necessary here.

Contributor: This can be addressed in a separate PR.

val condition = sparkAdapter.extractCondition(deleteTable)
if (condition != null) df = df.filter(Column(condition))

val hoodieCatalogTable = HoodieCatalogTable(sparkSession, tableId)
val config = buildHoodieDeleteTableConfig(hoodieCatalogTable, sparkSession)
@@ -57,6 +57,14 @@ class HoodieCommonSqlParser(session: SparkSession, delegate: ParserInterface)

override def parseDataType(sqlText: String): DataType = delegate.parseDataType(sqlText)

/* SPARK-37266 added parseQuery to ParserInterface in Spark 3.3.0. It is a patch to prevent
tampering with the text of persisted views, and it won't be called in older Spark versions.
Don't mark this as override, for backward compatibility.
Can't use sparkExtendedParser directly here for the same reason. */
Contributor: Sorry, but I can't understand the javadoc; can you please elaborate on why this is here?
  • What exactly are we trying to prevent from happening?
  • What BWC are we referring to?

Contributor Author: parseQuery is a new method of the Spark trait ParserInterface. There would be a compile issue if we called this method from any class that's shared across different Spark versions, because the older ParserInterface doesn't have this method. For the same reason, we can't mark this method with override, because older Spark doesn't have parseQuery.

Contributor: As discussed on Slack, instead of doing the parsing in SparkAdapter, let's create an ExtendedParserInterface where we can place this new parseQuery method and that can be used in Hudi's code base (this is similar to how HoodieCatalystExpressionUtils is set up).
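
A minimal sketch of that suggestion (trait name and default behavior assumed, not necessarily what was merged):

```scala
import org.apache.spark.sql.catalyst.parser.ParserInterface
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

// Hypothetical shape of the suggested ExtendedParserInterface: because the method is declared
// here rather than on Spark's ParserInterface, shared Hudi code can call it against Spark 3.1/3.2
// without a compile error, while on Spark 3.3 this default also satisfies the new abstract
// ParserInterface.parseQuery until a concrete parser overrides it with real parsing.
trait HoodieExtendedParserInterface extends ParserInterface {
  def parseQuery(sqlText: String): LogicalPlan =
    throw new UnsupportedOperationException("parseQuery is only supported on Spark 3.3+")
}
```

Shared code would then depend on this trait on every Spark version, while only the Spark 3.3 parser supplies a working parseQuery.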

def parseQuery(sqlText: String): LogicalPlan = parse(sqlText) { parser =>
Contributor: Why are we doing double-parsing?

Contributor Author: I reused the code flow from the parsePlan method in the same class here. Calling parse might not be needed here. Good point.

sparkAdapter.getQueryParserFromExtendedSqlParser(session, delegate, sqlText)
}

def parseRawDataType(sqlText : String) : DataType = {
throw new UnsupportedOperationException(s"Unsupported parseRawDataType method")
}