diff --git a/.github/workflows/bot.yml b/.github/workflows/bot.yml index dfc6c505c8a1d..29702846b3d2d 100644 --- a/.github/workflows/bot.yml +++ b/.github/workflows/bot.yml @@ -14,51 +14,26 @@ jobs: build: runs-on: ubuntu-latest strategy: - max-parallel: 8 matrix: include: - # Spark 2.4.4, scala 2.11 - scalaProfile: "scala-2.11" sparkProfile: "spark2.4" - sparkVersion: "2.4.4" flinkProfile: "flink1.13" - # Spark 2.4.4, scala 2.12 - - scalaProfile: "scala-2.12" + - scalaProfile: "scala-2.11" sparkProfile: "spark2.4" - sparkVersion: "2.4.4" flinkProfile: "flink1.14" - # Spark 3.1.x - - scalaProfile: "scala-2.12" - sparkProfile: "spark3.1" - sparkVersion: "3.1.0" - flinkProfile: "flink1.13" - - scalaProfile: "scala-2.12" - sparkProfile: "spark3.1" - sparkVersion: "3.1.1" + sparkProfile: "spark2.4" flinkProfile: "flink1.13" - scalaProfile: "scala-2.12" sparkProfile: "spark3.1" - sparkVersion: "3.1.2" flinkProfile: "flink1.14" - - scalaProfile: "scala-2.12" - sparkProfile: "spark3.1" - sparkVersion: "3.1.3" - flinkProfile: "flink1.14" - - # Spark 3.2.x - - scalaProfile: "scala-2.12" - sparkProfile: "spark3.2" - sparkVersion: "3.2.0" - flinkProfile: "flink1.13" - - scalaProfile: "scala-2.12" sparkProfile: "spark3.2" - sparkVersion: "3.2.1" flinkProfile: "flink1.14" steps: @@ -73,16 +48,14 @@ jobs: env: SCALA_PROFILE: ${{ matrix.scalaProfile }} SPARK_PROFILE: ${{ matrix.sparkProfile }} - SPARK_VERSION: ${{ matrix.sparkVersion }} FLINK_PROFILE: ${{ matrix.flinkProfile }} run: - mvn clean install -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -D"$FLINK_PROFILE" -Dspark.version="$SPARK_VERSION" -Pintegration-tests -DskipTests=true -B -V + mvn clean install -Pintegration-tests -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -D"$FLINK_PROFILE" -DskipTests=true -B -V - name: Quickstart Test env: SCALA_PROFILE: ${{ matrix.scalaProfile }} SPARK_PROFILE: ${{ matrix.sparkProfile }} - SPARK_VERSION: ${{ matrix.sparkVersion }} FLINK_PROFILE: ${{ matrix.flinkProfile }} - if: ${{ !startsWith(env.SPARK_VERSION, '3.2.') }} # skip test spark 3.2 before hadoop upgrade to 3.x + if: ${{ !endsWith(env.SPARK_PROFILE, '3.2') }} # skip test spark 3.2 before hadoop upgrade to 3.x run: - mvn test -P "unit-tests" -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -D"$FLINK_PROFILE" -Dspark.version="$SPARK_VERSION" -DfailIfNoTests=false -pl hudi-examples/hudi-examples-flink,hudi-examples/hudi-examples-java,hudi-examples/hudi-examples-spark + mvn test -Punit-tests -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -D"$FLINK_PROFILE" -DfailIfNoTests=false -pl hudi-examples/hudi-examples-flink,hudi-examples/hudi-examples-java,hudi-examples/hudi-examples-spark diff --git a/README.md b/README.md index e646463bac992..07a2201c96693 100644 --- a/README.md +++ b/README.md @@ -16,8 +16,11 @@ --> # Apache Hudi -Apache Hudi (pronounced Hoodie) stands for `Hadoop Upserts Deletes and Incrementals`. -Hudi manages the storage of large analytical datasets on DFS (Cloud stores, HDFS or any Hadoop FileSystem compatible storage). + +Apache Hudi (pronounced Hoodie) stands for `Hadoop Upserts Deletes and Incrementals`. Hudi manages the storage of large +analytical datasets on DFS (Cloud stores, HDFS or any Hadoop FileSystem compatible storage). 
+ +Hudi logo @@ -25,12 +28,15 @@ Hudi manages the storage of large analytical datasets on DFS (Cloud stores, HDFS [![Test](https://dev.azure.com/apache-hudi-ci-org/apache-hudi-ci/_apis/build/status/apachehudi-ci.hudi-mirror?branchName=master)](https://dev.azure.com/apache-hudi-ci-org/apache-hudi-ci/_build/latest?definitionId=3&branchName=master) [![License](https://img.shields.io/badge/license-Apache%202-4EB1BA.svg)](https://www.apache.org/licenses/LICENSE-2.0.html) [![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.apache.hudi/hudi/badge.svg)](http://search.maven.org/#search%7Cga%7C1%7Cg%3A%22org.apache.hudi%22) +![GitHub commit activity](https://img.shields.io/github/commit-activity/m/apache/hudi) [![Join on Slack](https://img.shields.io/badge/slack-%23hudi-72eff8?logo=slack&color=48c628&label=Join%20on%20Slack)](https://join.slack.com/t/apache-hudi/shared_invite/enQtODYyNDAxNzc5MTg2LTE5OTBlYmVhYjM0N2ZhOTJjOWM4YzBmMWU2MjZjMGE4NDc5ZDFiOGQ2N2VkYTVkNzU3ZDQ4OTI1NmFmYWQ0NzE) +![Twitter Follow](https://img.shields.io/twitter/follow/ApacheHudi) ## Features + * Upsert support with fast, pluggable indexing * Atomically publish data with rollback support -* Snapshot isolation between writer & queries +* Snapshot isolation between writer & queries * Savepoints for data recovery * Manages file sizes, layout using statistics * Async compaction of row & columnar data @@ -64,6 +70,8 @@ spark-2.4.4-bin-hadoop2.7/bin/spark-shell \ --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' ``` +To build for integration tests that include `hudi-integ-test-bundle`, use `-Dintegration-tests`. + To build the Javadoc for all Java and Scala classes: ``` # Javadoc generated under target/site/apidocs @@ -72,35 +80,46 @@ mvn clean javadoc:aggregate -Pjavadocs ### Build with different Spark versions -The default Spark version supported is 2.4.4. To build for different Spark versions and Scala 2.12, use the -corresponding profile +The default Spark version supported is 2.4.4. Refer to the table below for building with different Spark and Scala versions. 
-| Label | Artifact Name for Spark Bundle | Maven Profile Option | Notes |
-|--|--|--|--|
-| Spark 2.4, Scala 2.11 | hudi-spark2.4-bundle_2.11 | `-Pspark2.4` | For Spark 2.4.4, which is the same as the default |
-| Spark 2.4, Scala 2.12 | hudi-spark2.4-bundle_2.12 | `-Pspark2.4,scala-2.12` | For Spark 2.4.4, which is the same as the default and Scala 2.12 |
-| Spark 3.1, Scala 2.12 | hudi-spark3.1-bundle_2.12 | `-Pspark3.1` | For Spark 3.1.x |
-| Spark 3.2, Scala 2.12 | hudi-spark3.2-bundle_2.12 | `-Pspark3.2` | For Spark 3.2.x |
-| Spark 3, Scala 2.12 | hudi-spark3-bundle_2.12 | `-Pspark3` | This is the same as `Spark 3.2, Scala 2.12` |
-| Spark, Scala 2.11 | hudi-spark-bundle_2.11 | Default | The default profile, supporting Spark 2.4.4 |
-| Spark, Scala 2.12 | hudi-spark-bundle_2.12 | `-Pscala-2.12` | The default profile (for Spark 2.4.4) with Scala 2.12 |
+| Maven build options | Expected Spark bundle jar name | Notes |
+|:--------------------------|:---------------------------------------------|:-------------------------------------------------|
+| (empty) | hudi-spark-bundle_2.11 (legacy bundle name) | For Spark 2.4.4 and Scala 2.11 (default options) |
+| `-Dspark2.4` | hudi-spark2.4-bundle_2.11 | For Spark 2.4.4 and Scala 2.11 (same as default) |
+| `-Dspark2.4 -Dscala-2.12` | hudi-spark2.4-bundle_2.12 | For Spark 2.4.4 and Scala 2.12 |
+| `-Dspark3.1 -Dscala-2.12` | hudi-spark3.1-bundle_2.12 | For Spark 3.1.x and Scala 2.12 |
+| `-Dspark3.2 -Dscala-2.12` | hudi-spark3.2-bundle_2.12 | For Spark 3.2.x and Scala 2.12 |
+| `-Dspark3` | hudi-spark3-bundle_2.12 (legacy bundle name) | For Spark 3.2.x and Scala 2.12 |
+| `-Dscala-2.12` | hudi-spark-bundle_2.12 (legacy bundle name) | For Spark 2.4.4 and Scala 2.12 |

 For example,
 ```
-# Build against Spark 3.2.x (the default build shipped with the public Spark 3 bundle)
-mvn clean package -DskipTests -Pspark3.2
+# Build against Spark 3.2.x
+mvn clean package -DskipTests -Dspark3.2 -Dscala-2.12

 # Build against Spark 3.1.x
-mvn clean package -DskipTests -Pspark3.1
+mvn clean package -DskipTests -Dspark3.1 -Dscala-2.12

 # Build against Spark 2.4.4 and Scala 2.12
-mvn clean package -DskipTests -Pspark2.4,scala-2.12
+mvn clean package -DskipTests -Dspark2.4 -Dscala-2.12
 ```

-### What about "spark-avro" module?
+#### What about "spark-avro" module?

 Starting from versions 0.11, Hudi no longer requires `spark-avro` to be specified using `--packages`

+### Build with different Flink versions
+
+The default Flink version supported is 1.14. Refer to the table below for building with different Flink and Scala versions.
+
+| Maven build options | Expected Flink bundle jar name | Notes |
+|:---------------------------|:-------------------------------|:------------------------------------------------|
+| (empty) | hudi-flink1.14-bundle_2.11 | For Flink 1.14 and Scala 2.11 (default options) |
+| `-Dflink1.14` | hudi-flink1.14-bundle_2.11 | For Flink 1.14 and Scala 2.11 (same as default) |
+| `-Dflink1.14 -Dscala-2.12` | hudi-flink1.14-bundle_2.12 | For Flink 1.14 and Scala 2.12 |
+| `-Dflink1.13` | hudi-flink1.13-bundle_2.11 | For Flink 1.13 and Scala 2.11 |
+| `-Dflink1.13 -Dscala-2.12` | hudi-flink1.13-bundle_2.12 | For Flink 1.13 and Scala 2.12 |
+
 ## Running Tests

 Unit tests can be run with maven profile `unit-tests`.
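A minimal sketch tying the documentation above to concrete commands; the flag combinations below are assembled from the new README tables and from the CI steps earlier in this diff, not copied verbatim from the repository.

```
# Build the Flink bundle for Flink 1.13 and Scala 2.12, assuming the same
# "mvn clean package -DskipTests" form used in the Spark examples above
mvn clean package -DskipTests -Dflink1.13 -Dscala-2.12

# Run unit tests with the unit-tests profile; the command shape and module list
# mirror the Quickstart Test step in the bot.yml change, with illustrative profile values
mvn test -Punit-tests -Dscala-2.12 -Dspark3.1 -Dflink1.14 -DfailIfNoTests=false \
  -pl hudi-examples/hudi-examples-flink,hudi-examples/hudi-examples-java,hudi-examples/hudi-examples-spark
```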
diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 8ca54c1ab39ef..8a2d7f0de076a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -33,7 +33,7 @@ stages: jobs: - job: UT_FT_1 displayName: UT FT common & flink & UT client/spark-client - timeoutInMinutes: '90' + timeoutInMinutes: '120' steps: - task: Maven@3 displayName: maven install @@ -64,7 +64,7 @@ stages: mavenOptions: '-Xmx4g $(MAVEN_OPTS)' - job: UT_FT_2 displayName: FT client/spark-client - timeoutInMinutes: '90' + timeoutInMinutes: '120' steps: - task: Maven@3 displayName: maven install @@ -86,7 +86,7 @@ stages: mavenOptions: '-Xmx4g $(MAVEN_OPTS)' - job: UT_FT_3 displayName: UT FT clients & cli & utilities & sync/hive-sync - timeoutInMinutes: '90' + timeoutInMinutes: '120' steps: - task: Maven@3 displayName: maven install @@ -117,7 +117,7 @@ stages: mavenOptions: '-Xmx4g $(MAVEN_OPTS)' - job: UT_FT_4 displayName: UT FT other modules - timeoutInMinutes: '90' + timeoutInMinutes: '120' steps: - task: Maven@3 displayName: maven install @@ -148,8 +148,26 @@ stages: mavenOptions: '-Xmx4g $(MAVEN_OPTS)' - job: IT displayName: IT modules - timeoutInMinutes: '90' + timeoutInMinutes: '120' steps: + - task: Maven@3 + displayName: maven install + inputs: + mavenPomFile: 'pom.xml' + goals: 'clean install' + options: -T 2.5C -Pintegration-tests -DskipTests + publishJUnitResults: false + jdkVersionOption: '1.8' + mavenOptions: '-Xmx4g $(MAVEN_OPTS)' + - task: Maven@3 + displayName: UT integ-test + inputs: + mavenPomFile: 'pom.xml' + goals: 'test' + options: -Pintegration-tests -DskipUTs=false -DskipITs=true -pl hudi-integ-test test + publishJUnitResults: false + jdkVersionOption: '1.8' + mavenOptions: '-Xmx4g $(MAVEN_OPTS)' - task: AzureCLI@2 displayName: Prepare for IT inputs: diff --git a/doap_HUDI.rdf b/doap_HUDI.rdf index a3b958a5cd7d0..3b359bdb26445 100644 --- a/doap_HUDI.rdf +++ b/doap_HUDI.rdf @@ -86,6 +86,11 @@ 2022-01-26 0.10.1 + + Apache Hudi 0.11.0 + 2022-04-30 + 0.11.0 + diff --git a/docker/compose/docker-compose_hadoop284_hive233_spark244.yml b/docker/compose/docker-compose_hadoop284_hive233_spark244.yml index 086004f121e97..b8217fc0d0401 100644 --- a/docker/compose/docker-compose_hadoop284_hive233_spark244.yml +++ b/docker/compose/docker-compose_hadoop284_hive233_spark244.yml @@ -26,6 +26,8 @@ services: ports: - "50070:50070" - "8020:8020" + # JVM debugging port (will be mapped to a random port on host) + - "5005" env_file: - ./hadoop.env healthcheck: @@ -45,6 +47,8 @@ services: ports: - "50075:50075" - "50010:50010" + # JVM debugging port (will be mapped to a random port on host) + - "5005" links: - "namenode" - "historyserver" @@ -99,6 +103,8 @@ services: SERVICE_PRECONDITION: "namenode:50070 hive-metastore-postgresql:5432" ports: - "9083:9083" + # JVM debugging port (will be mapped to a random port on host) + - "5005" healthcheck: test: ["CMD", "nc", "-z", "hivemetastore", "9083"] interval: 30s @@ -118,6 +124,8 @@ services: SERVICE_PRECONDITION: "hivemetastore:9083" ports: - "10000:10000" + # JVM debugging port (will be mapped to a random port on host) + - "5005" depends_on: - "hivemetastore" links: @@ -136,6 +144,8 @@ services: ports: - "8080:8080" - "7077:7077" + # JVM debugging port (will be mapped to a random port on host) + - "5005" environment: - INIT_DAEMON_STEP=setup_spark links: @@ -154,6 +164,8 @@ services: - sparkmaster ports: - "8081:8081" + # JVM debugging port (will be mapped to a random port on host) + - "5005" environment: - "SPARK_MASTER=spark://sparkmaster:7077" links: @@ -167,7 +179,7 @@ 
services: hostname: zookeeper container_name: zookeeper ports: - - '2181:2181' + - "2181:2181" environment: - ALLOW_ANONYMOUS_LOGIN=yes @@ -176,7 +188,7 @@ services: hostname: kafkabroker container_name: kafkabroker ports: - - '9092:9092' + - "9092:9092" environment: - KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181 - ALLOW_PLAINTEXT_LISTENER=yes @@ -186,7 +198,9 @@ services: hostname: presto-coordinator-1 image: apachehudi/hudi-hadoop_2.8.4-prestobase_0.271:latest ports: - - '8090:8090' + - "8090:8090" + # JVM debugging port (will be mapped to a random port on host) + - "5005" environment: - PRESTO_JVM_MAX_HEAP=512M - PRESTO_QUERY_MAX_MEMORY=1GB @@ -226,7 +240,9 @@ services: hostname: trino-coordinator-1 image: apachehudi/hudi-hadoop_2.8.4-trinocoordinator_368:latest ports: - - '8091:8091' + - "8091:8091" + # JVM debugging port (will be mapped to a random port on host) + - "5005" links: - "hivemetastore" volumes: @@ -239,7 +255,9 @@ services: image: apachehudi/hudi-hadoop_2.8.4-trinoworker_368:latest depends_on: [ "trino-coordinator-1" ] ports: - - '8092:8092' + - "8092:8092" + # JVM debugging port (will be mapped to a random port on host) + - "5005" links: - "hivemetastore" - "hiveserver" @@ -268,6 +286,8 @@ services: - sparkmaster ports: - '4040:4040' + # JVM debugging port (mapped to 5006 on the host) + - "5006:5005" environment: - "SPARK_MASTER=spark://sparkmaster:7077" links: @@ -286,6 +306,9 @@ services: container_name: adhoc-2 env_file: - ./hadoop.env + ports: + # JVM debugging port (mapped to 5005 on the host) + - "5005:5005" depends_on: - sparkmaster environment: diff --git a/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions-metadata.yaml b/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions-metadata.yaml index 57c8d010080a0..dc1e99a431209 100644 --- a/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions-metadata.yaml +++ b/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions-metadata.yaml @@ -20,7 +20,7 @@ dag_content: first_insert: config: record_size: 1000 - num_partitions_insert: 5 + num_partitions_insert: 10 repeat_count: 1 num_records_insert: 1000 type: InsertNode @@ -28,7 +28,7 @@ dag_content: second_insert: config: record_size: 1000 - num_partitions_insert: 50 + num_partitions_insert: 10 repeat_count: 1 num_records_insert: 10000 deps: first_insert @@ -36,7 +36,7 @@ dag_content: third_insert: config: record_size: 1000 - num_partitions_insert: 2 + num_partitions_insert: 10 repeat_count: 1 num_records_insert: 300 deps: second_insert @@ -44,16 +44,16 @@ dag_content: first_upsert: config: record_size: 1000 - num_partitions_insert: 2 + num_partitions_insert: 10 num_records_insert: 300 repeat_count: 1 num_records_upsert: 100 - num_partitions_upsert: 1 + num_partitions_upsert: 10 type: UpsertNode deps: third_insert first_delete: config: - num_partitions_delete: 50 + num_partitions_delete: 10 num_records_delete: 4000 type: DeleteNode deps: first_upsert diff --git a/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions.yaml b/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions.yaml index a29152bb45431..eca4eac1c710a 100644 --- a/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions.yaml +++ b/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions.yaml @@ -20,7 +20,7 @@ dag_content: first_insert: config: record_size: 1000 - num_partitions_insert: 5 + num_partitions_insert: 10 repeat_count: 1 num_records_insert: 1000 type: 
InsertNode @@ -28,7 +28,7 @@ dag_content: second_insert: config: record_size: 1000 - num_partitions_insert: 50 + num_partitions_insert: 10 repeat_count: 1 num_records_insert: 10000 deps: first_insert @@ -36,7 +36,7 @@ dag_content: third_insert: config: record_size: 1000 - num_partitions_insert: 2 + num_partitions_insert: 10 repeat_count: 1 num_records_insert: 300 deps: second_insert @@ -44,16 +44,16 @@ dag_content: first_upsert: config: record_size: 1000 - num_partitions_insert: 2 + num_partitions_insert: 10 num_records_insert: 300 repeat_count: 1 num_records_upsert: 100 - num_partitions_upsert: 1 + num_partitions_upsert: 10 type: UpsertNode deps: third_insert first_delete: config: - num_partitions_delete: 50 + num_partitions_delete: 10 num_records_delete: 4000 type: DeleteNode deps: first_upsert diff --git a/docker/demo/config/test-suite/deltastreamer-medium-clustering.yaml b/docker/demo/config/test-suite/deltastreamer-medium-clustering.yaml index 0cd4108cb6334..81c21a7be67c8 100644 --- a/docker/demo/config/test-suite/deltastreamer-medium-clustering.yaml +++ b/docker/demo/config/test-suite/deltastreamer-medium-clustering.yaml @@ -23,7 +23,7 @@ dag_content: first_insert: config: record_size: 1000 - num_partitions_insert: 5 + num_partitions_insert: 50 repeat_count: 1 num_records_insert: 1000 type: InsertNode @@ -39,7 +39,7 @@ dag_content: third_insert: config: record_size: 1000 - num_partitions_insert: 2 + num_partitions_insert: 50 repeat_count: 1 num_records_insert: 300 deps: second_insert @@ -47,11 +47,11 @@ dag_content: first_upsert: config: record_size: 1000 - num_partitions_insert: 2 + num_partitions_insert: 50 num_records_insert: 300 repeat_count: 1 num_records_upsert: 100 - num_partitions_upsert: 1 + num_partitions_upsert: 50 type: UpsertNode deps: third_insert first_delete: diff --git a/docker/demo/config/test-suite/deltastreamer-medium-full-dataset-validation.yaml b/docker/demo/config/test-suite/deltastreamer-medium-full-dataset-validation.yaml index a20870f262d8b..a2d85a7a4d0f5 100644 --- a/docker/demo/config/test-suite/deltastreamer-medium-full-dataset-validation.yaml +++ b/docker/demo/config/test-suite/deltastreamer-medium-full-dataset-validation.yaml @@ -23,7 +23,7 @@ dag_content: first_insert: config: record_size: 1000 - num_partitions_insert: 5 + num_partitions_insert: 50 repeat_count: 1 num_records_insert: 1000 type: InsertNode @@ -39,7 +39,7 @@ dag_content: third_insert: config: record_size: 1000 - num_partitions_insert: 2 + num_partitions_insert: 50 repeat_count: 1 num_records_insert: 300 deps: second_insert @@ -47,11 +47,11 @@ dag_content: first_upsert: config: record_size: 1000 - num_partitions_insert: 2 + num_partitions_insert: 50 num_records_insert: 300 repeat_count: 1 num_records_upsert: 100 - num_partitions_upsert: 1 + num_partitions_upsert: 50 type: UpsertNode deps: third_insert first_delete: diff --git a/docker/demo/config/test-suite/cow-spark-simple.yaml b/docker/demo/config/test-suite/deltastreamer-non-partitioned.yaml similarity index 65% rename from docker/demo/config/test-suite/cow-spark-simple.yaml rename to docker/demo/config/test-suite/deltastreamer-non-partitioned.yaml index 192adcf377dc0..a8be72e108136 100644 --- a/docker/demo/config/test-suite/cow-spark-simple.yaml +++ b/docker/demo/config/test-suite/deltastreamer-non-partitioned.yaml @@ -13,42 +13,51 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-dag_name: cow-spark-simple.yaml -dag_rounds: 1 -dag_intermittent_delay_mins: 1 +dag_name: deltastreamer-long-running-multi-partitions.yaml +dag_rounds: 6 +dag_intermittent_delay_mins: 0 dag_content: first_insert: config: record_size: 1000 num_partitions_insert: 1 repeat_count: 1 - num_records_insert: 100 - type: SparkInsertNode + num_records_insert: 1000 + type: InsertNode deps: none - first_validate: + second_insert: config: - validate_hive: false - type: ValidateDatasetNode + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 10000 deps: first_insert + type: InsertNode first_upsert: config: record_size: 1000 num_partitions_insert: 1 - num_records_insert: 50 + num_records_insert: 1000 repeat_count: 1 - num_records_upsert: 100 + num_records_upsert: 8000 num_partitions_upsert: 1 - type: SparkUpsertNode - deps: first_validate + type: UpsertNode + deps: second_insert first_delete: config: num_partitions_delete: 1 - num_records_delete: 30 - type: SparkDeleteNode + num_records_delete: 1000 + type: DeleteNode deps: first_upsert second_validate: config: + validate_once_every_itr : 3 validate_hive: false - delete_input_data: false + delete_input_data: true type: ValidateDatasetNode - deps: first_delete \ No newline at end of file + deps: first_delete + last_validate: + config: + execute_itr_count: 6 + type: ValidateAsyncOperations + deps: second_validate diff --git a/docker/demo/config/test-suite/insert-overwrite-table.yaml b/docker/demo/config/test-suite/insert-overwrite-table.yaml index 1a58abdcc4789..2251660b7028c 100644 --- a/docker/demo/config/test-suite/insert-overwrite-table.yaml +++ b/docker/demo/config/test-suite/insert-overwrite-table.yaml @@ -56,7 +56,7 @@ dag_content: first_insert_overwrite_table: config: record_size: 1000 - repeat_count: 10 + repeat_count: 1 num_records_insert: 10 type: SparkInsertOverwriteTableNode deps: second_upsert diff --git a/docker/demo/config/test-suite/cow-spark-long-running.yaml b/docker/demo/config/test-suite/spark-long-running-non-partitioned.yaml similarity index 86% rename from docker/demo/config/test-suite/cow-spark-long-running.yaml rename to docker/demo/config/test-suite/spark-long-running-non-partitioned.yaml index 00fea43f4578e..3c47729e66470 100644 --- a/docker/demo/config/test-suite/cow-spark-long-running.yaml +++ b/docker/demo/config/test-suite/spark-long-running-non-partitioned.yaml @@ -14,13 +14,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
dag_name: cow-spark-deltastreamer-long-running-multi-partitions.yaml -dag_rounds: 30 +dag_rounds: 6 dag_intermittent_delay_mins: 0 dag_content: first_insert: config: record_size: 200 - num_partitions_insert: 50 + num_partitions_insert: 1 repeat_count: 1 num_records_insert: 10000 type: SparkInsertNode @@ -28,28 +28,28 @@ dag_content: first_upsert: config: record_size: 200 - num_partitions_insert: 50 + num_partitions_insert: 1 num_records_insert: 300 repeat_count: 1 num_records_upsert: 3000 - num_partitions_upsert: 50 + num_partitions_upsert: 1 type: SparkUpsertNode deps: first_insert first_delete: config: - num_partitions_delete: 50 - num_records_delete: 4000 + num_partitions_delete: 1 + num_records_delete: 1000 type: SparkDeleteNode deps: first_upsert second_validate: config: - validate_once_every_itr : 5 + validate_once_every_itr : 3 validate_hive: false delete_input_data: true type: ValidateDatasetNode deps: first_delete last_validate: config: - execute_itr_count: 30 + execute_itr_count: 6 type: ValidateAsyncOperations deps: second_validate diff --git a/docker/demo/config/test-suite/test-aggressive-clean-archival-inline-compact.properties b/docker/demo/config/test-suite/test-aggressive-clean-archival-inline-compact.properties new file mode 100644 index 0000000000000..14427f323cead --- /dev/null +++ b/docker/demo/config/test-suite/test-aggressive-clean-archival-inline-compact.properties @@ -0,0 +1,57 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.cleaner.commits.retained=8 +hoodie.keep.min.commits=12 +hoodie.keep.max.commits=14 + +hoodie.compact.inline=true +hoodie.embed.timeline.server=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-aggressive-clean-archival.properties b/docker/demo/config/test-suite/test-aggressive-clean-archival.properties index 159c1f233185c..f0d9de251b869 100644 --- a/docker/demo/config/test-suite/test-aggressive-clean-archival.properties +++ b/docker/demo/config/test-suite/test-aggressive-clean-archival.properties @@ -18,18 +18,19 @@ # under the License. 
# -hoodie.insert.shuffle.parallelism=100 -hoodie.upsert.shuffle.parallelism=100 -hoodie.bulkinsert.shuffle.parallelism=100 +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 hoodie.cleaner.commits.retained=8 hoodie.keep.min.commits=12 hoodie.keep.max.commits=14 +hoodie.embed.timeline.server=false hoodie.deltastreamer.source.test.num_partitions=100 hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false hoodie.deltastreamer.source.test.max_unique_records=100000000 -hoodie.embed.timeline.server=false hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector diff --git a/docker/demo/config/test-suite/test-clustering-aggressive-clean-archival-inline-compact.properties b/docker/demo/config/test-suite/test-clustering-aggressive-clean-archival-inline-compact.properties new file mode 100644 index 0000000000000..748972861851d --- /dev/null +++ b/docker/demo/config/test-suite/test-clustering-aggressive-clean-archival-inline-compact.properties @@ -0,0 +1,64 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.compact.inline=true +hoodie.embed.timeline.server=false + +hoodie.cleaner.commits.retained=8 +hoodie.keep.min.commits=12 +hoodie.keep.max.commits=14 + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-clustering-aggressive-clean-archival.properties b/docker/demo/config/test-suite/test-clustering-aggressive-clean-archival.properties index d079536f95363..b94ccabb55e09 100644 --- a/docker/demo/config/test-suite/test-clustering-aggressive-clean-archival.properties +++ b/docker/demo/config/test-suite/test-clustering-aggressive-clean-archival.properties @@ -18,18 +18,19 @@ # under the License. 
# -hoodie.insert.shuffle.parallelism=100 -hoodie.upsert.shuffle.parallelism=100 -hoodie.bulkinsert.shuffle.parallelism=100 +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 hoodie.cleaner.commits.retained=8 hoodie.keep.min.commits=12 hoodie.keep.max.commits=14 +hoodie.embed.timeline.server=false hoodie.deltastreamer.source.test.num_partitions=100 hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false hoodie.deltastreamer.source.test.max_unique_records=100000000 -hoodie.embed.timeline.server=false hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector diff --git a/docker/demo/config/test-suite/test-clustering-inline-compact.properties b/docker/demo/config/test-suite/test-clustering-inline-compact.properties new file mode 100644 index 0000000000000..5e86790c723a9 --- /dev/null +++ b/docker/demo/config/test-suite/test-clustering-inline-compact.properties @@ -0,0 +1,60 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.embed.timeline.server=false + +hoodie.compact.inline=true +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-clustering-metadata-aggressive-clean-archival-inline-compact.properties b/docker/demo/config/test-suite/test-clustering-metadata-aggressive-clean-archival-inline-compact.properties new file mode 100644 index 0000000000000..dd3089d190184 --- /dev/null +++ b/docker/demo/config/test-suite/test-clustering-metadata-aggressive-clean-archival-inline-compact.properties @@ -0,0 +1,65 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.cleaner.commits.retained=8 +hoodie.keep.min.commits=12 +hoodie.keep.max.commits=14 + +hoodie.compact.inline=true +hoodie.metadata.enable=true +hoodie.embed.timeline.server=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-clustering-metadata-aggressive-clean-archival.properties b/docker/demo/config/test-suite/test-clustering-metadata-aggressive-clean-archival.properties index 23b95f430408d..c10d6ecc48007 100644 --- a/docker/demo/config/test-suite/test-clustering-metadata-aggressive-clean-archival.properties +++ b/docker/demo/config/test-suite/test-clustering-metadata-aggressive-clean-archival.properties @@ -18,20 +18,21 @@ # under the License. 
# -hoodie.insert.shuffle.parallelism=100 -hoodie.upsert.shuffle.parallelism=100 -hoodie.bulkinsert.shuffle.parallelism=100 +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 hoodie.cleaner.commits.retained=8 hoodie.keep.min.commits=12 hoodie.keep.max.commits=14 +hoodie.embed.timeline.server=false hoodie.metadata.enable=true hoodie.deltastreamer.source.test.num_partitions=100 hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false hoodie.deltastreamer.source.test.max_unique_records=100000000 -hoodie.embed.timeline.server=false hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector diff --git a/docker/demo/config/test-suite/test-clustering.properties b/docker/demo/config/test-suite/test-clustering.properties index 9aa4843b2746e..677cf96751d77 100644 --- a/docker/demo/config/test-suite/test-clustering.properties +++ b/docker/demo/config/test-suite/test-clustering.properties @@ -18,14 +18,16 @@ # under the License. # -hoodie.insert.shuffle.parallelism=100 -hoodie.upsert.shuffle.parallelism=100 -hoodie.bulkinsert.shuffle.parallelism=100 +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.embed.timeline.server=false hoodie.deltastreamer.source.test.num_partitions=100 hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false hoodie.deltastreamer.source.test.max_unique_records=100000000 -hoodie.embed.timeline.server=false hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector diff --git a/docker/demo/config/test-suite/test-inline-compact.properties b/docker/demo/config/test-suite/test-inline-compact.properties new file mode 100644 index 0000000000000..76de6bd2678e1 --- /dev/null +++ b/docker/demo/config/test-suite/test-inline-compact.properties @@ -0,0 +1,54 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.metadata.enable=false +hoodie.compact.inline=true +hoodie.embed.timeline.server=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-metadata-aggressive-clean-archival-inline-compact.properties b/docker/demo/config/test-suite/test-metadata-aggressive-clean-archival-inline-compact.properties new file mode 100644 index 0000000000000..7921162356e2d --- /dev/null +++ b/docker/demo/config/test-suite/test-metadata-aggressive-clean-archival-inline-compact.properties @@ -0,0 +1,58 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.cleaner.commits.retained=8 +hoodie.keep.min.commits=12 +hoodie.keep.max.commits=14 + +hoodie.embed.timeline.server=false +hoodie.metadata.enable=true +hoodie.compact.inline=true + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-metadata-aggressive-clean-archival.properties b/docker/demo/config/test-suite/test-metadata-aggressive-clean-archival.properties index 160da83004f44..5bad7fc4ef100 100644 --- a/docker/demo/config/test-suite/test-metadata-aggressive-clean-archival.properties +++ b/docker/demo/config/test-suite/test-metadata-aggressive-clean-archival.properties @@ -18,20 +18,21 @@ # under the License. 
# -hoodie.insert.shuffle.parallelism=100 -hoodie.upsert.shuffle.parallelism=100 -hoodie.bulkinsert.shuffle.parallelism=100 +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 hoodie.cleaner.commits.retained=8 hoodie.keep.min.commits=12 hoodie.keep.max.commits=14 hoodie.metadata.enable=true +hoodie.embed.timeline.server=false hoodie.deltastreamer.source.test.num_partitions=100 hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false hoodie.deltastreamer.source.test.max_unique_records=100000000 -hoodie.embed.timeline.server=false hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector diff --git a/docker/demo/config/test-suite/test-metadata-inline-compact.properties b/docker/demo/config/test-suite/test-metadata-inline-compact.properties new file mode 100644 index 0000000000000..5230a1488ca67 --- /dev/null +++ b/docker/demo/config/test-suite/test-metadata-inline-compact.properties @@ -0,0 +1,58 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.metadata.enable=true +hoodie.compact.inline=true +hoodie.embed.timeline.server=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.clustering.plan.strategy.sort.columns=_row_key +hoodie.clustering.plan.strategy.daybased.lookback.partitions=0 +hoodie.clustering.inline.max.commits=1 + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-metadata.properties b/docker/demo/config/test-suite/test-metadata.properties index 48da77c511e93..0edcd3c63d2ef 100644 --- a/docker/demo/config/test-suite/test-metadata.properties +++ b/docker/demo/config/test-suite/test-metadata.properties @@ -18,16 +18,17 @@ # under the License. # -hoodie.insert.shuffle.parallelism=100 -hoodie.upsert.shuffle.parallelism=100 -hoodie.bulkinsert.shuffle.parallelism=100 +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 hoodie.metadata.enable=true +hoodie.embed.timeline.server=false hoodie.deltastreamer.source.test.num_partitions=100 hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false hoodie.deltastreamer.source.test.max_unique_records=100000000 -hoodie.embed.timeline.server=false hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector diff --git a/docker/demo/config/test-suite/test-nonpartitioned-inline-compact.properties b/docker/demo/config/test-suite/test-nonpartitioned-inline-compact.properties new file mode 100644 index 0000000000000..97f2bfa4978d3 --- /dev/null +++ b/docker/demo/config/test-suite/test-nonpartitioned-inline-compact.properties @@ -0,0 +1,61 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.metadata.enable=false +hoodie.compact.inline=true +hoodie.embed.timeline.server=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.NonpartitionedKeyGenerator +hoodie.datasource.write.partitionpath.field= + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-nonpartitioned-metadata-inline-compact.properties b/docker/demo/config/test-suite/test-nonpartitioned-metadata-inline-compact.properties new file mode 100644 index 0000000000000..2298be18fe91d --- /dev/null +++ b/docker/demo/config/test-suite/test-nonpartitioned-metadata-inline-compact.properties @@ -0,0 +1,61 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.metadata.enable=true +hoodie.compact.inline=true +hoodie.embed.timeline.server=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.NonpartitionedKeyGenerator +hoodie.datasource.write.partitionpath.field= + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-nonpartitioned-metadata.properties b/docker/demo/config/test-suite/test-nonpartitioned-metadata.properties new file mode 100644 index 0000000000000..520534f3b3e92 --- /dev/null +++ b/docker/demo/config/test-suite/test-nonpartitioned-metadata.properties @@ -0,0 +1,60 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.metadata.enable=true +hoodie.embed.timeline.server=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.NonpartitionedKeyGenerator +hoodie.datasource.write.partitionpath.field= + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-nonpartitioned.properties b/docker/demo/config/test-suite/test-nonpartitioned.properties new file mode 100644 index 0000000000000..d51c4e5f843d3 --- /dev/null +++ b/docker/demo/config/test-suite/test-nonpartitioned.properties @@ -0,0 +1,60 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 + +hoodie.metadata.enable=false +hoodie.embed.timeline.server=false + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.NonpartitionedKeyGenerator +hoodie.datasource.write.partitionpath.field= + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test.properties b/docker/demo/config/test-suite/test.properties index 509b9f4ba628e..3b20d3286251a 100644 --- a/docker/demo/config/test-suite/test.properties +++ b/docker/demo/config/test-suite/test.properties @@ -15,16 +15,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-hoodie.insert.shuffle.parallelism=100 -hoodie.upsert.shuffle.parallelism=100 -hoodie.bulkinsert.shuffle.parallelism=100 +hoodie.insert.shuffle.parallelism=25 +hoodie.upsert.shuffle.parallelism=25 +hoodie.bulkinsert.shuffle.parallelism=25 +hoodie.delete.shuffle.parallelism=25 hoodie.metadata.enable=false +hoodie.embed.timeline.server=false hoodie.deltastreamer.source.test.num_partitions=100 hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false hoodie.deltastreamer.source.test.max_unique_records=100000000 -hoodie.embed.timeline.server=false hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector diff --git a/hudi-aws/src/main/java/org/apache/hudi/aws/sync/AWSGlueCatalogSyncClient.java b/hudi-aws/src/main/java/org/apache/hudi/aws/sync/AWSGlueCatalogSyncClient.java index 97e47deed8173..81c05ed132a35 100644 --- a/hudi-aws/src/main/java/org/apache/hudi/aws/sync/AWSGlueCatalogSyncClient.java +++ b/hudi-aws/src/main/java/org/apache/hudi/aws/sync/AWSGlueCatalogSyncClient.java @@ -94,13 +94,19 @@ public AWSGlueCatalogSyncClient(HiveSyncConfig syncConfig, Configuration hadoopC @Override public List getAllPartitions(String tableName) { try { - GetPartitionsRequest request = new GetPartitionsRequest(); - request.withDatabaseName(databaseName).withTableName(tableName); - GetPartitionsResult result = awsGlue.getPartitions(request); - return result.getPartitions() - .stream() - .map(p -> new Partition(p.getValues(), p.getStorageDescriptor().getLocation())) - .collect(Collectors.toList()); + List partitions = new ArrayList<>(); + String nextToken = null; + do { + GetPartitionsResult result = awsGlue.getPartitions(new GetPartitionsRequest() + .withDatabaseName(databaseName) + .withTableName(tableName) + .withNextToken(nextToken)); + partitions.addAll(result.getPartitions().stream() + .map(p -> new Partition(p.getValues(), p.getStorageDescriptor().getLocation())) + .collect(Collectors.toList())); + nextToken = result.getNextToken(); + } while (nextToken != null); + return partitions; } catch (Exception e) { throw new HoodieGlueSyncException("Failed to get all partitions for table " + tableId(databaseName, tableName), e); } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieLogFileCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieLogFileCommand.java index b1c5531a22fd0..49cc25b895730 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieLogFileCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieLogFileCommand.java @@ -67,7 +67,6 @@ import scala.Tuple2; import scala.Tuple3; -import static org.apache.hudi.common.fs.FSUtils.getRelativePartitionPath; import static org.apache.hudi.common.util.ValidationUtils.checkArgument; /** @@ -221,7 +220,6 @@ public String showLogFileRecords( .withSpillableMapBasePath(HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH.defaultValue()) .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue()) .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue()) - .withPartition(getRelativePartitionPath(new Path(client.getBasePath()), new Path(logFilePaths.get(0)).getParent())) .build(); for (HoodieRecord hoodieRecord : scanner) { Option record = hoodieRecord.getData().getInsertValue(readerSchema); diff --git 
a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java index 621061ae71122..ee7fbda11b783 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java @@ -65,7 +65,6 @@ import java.util.Map; import java.util.stream.Collectors; -import static org.apache.hudi.common.fs.FSUtils.getRelativePartitionPath; import static org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; @@ -204,7 +203,6 @@ public void testShowLogFileRecordsWithMerge() throws IOException, InterruptedExc // get expected result of 10 records. List logFilePaths = Arrays.stream(fs.globStatus(new Path(partitionPath + "/*"))) .map(status -> status.getPath().toString()).collect(Collectors.toList()); - assertTrue(logFilePaths.size() > 0); HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder() .withFileSystem(fs) .withBasePath(tablePath) @@ -223,7 +221,6 @@ public void testShowLogFileRecordsWithMerge() throws IOException, InterruptedExc .withSpillableMapBasePath(HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH.defaultValue()) .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue()) .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue()) - .withPartition(getRelativePartitionPath(new Path(tablePath), new Path(logFilePaths.get(0)).getParent())) .build(); Iterator> records = scanner.iterator(); diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestClusteringCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestClusteringCommand.java index 17075f9d3dfb6..97d3d91fb63ad 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestClusteringCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestClusteringCommand.java @@ -27,7 +27,6 @@ import org.apache.hudi.common.model.HoodieAvroPayload; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; -import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; @@ -37,7 +36,6 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.testutils.HoodieClientTestBase; - import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.junit.jupiter.api.BeforeEach; @@ -61,21 +59,18 @@ */ public class ITTestClusteringCommand extends AbstractShellIntegrationTest { - private String tablePath; - private String tableName; - @BeforeEach public void init() throws IOException { tableName = "test_table_" + ITTestClusteringCommand.class.getName(); - tablePath = Paths.get(basePath, tableName).toString(); + basePath = Paths.get(basePath, tableName).toString(); HoodieCLI.conf = jsc.hadoopConfiguration(); // Create table and connect new TableCommand().createTable( - tablePath, tableName, HoodieTableType.COPY_ON_WRITE.name(), + basePath, tableName, HoodieTableType.COPY_ON_WRITE.name(), "", TimelineLayoutVersion.VERSION_1, 
"org.apache.hudi.common.model.HoodieAvroPayload"); - metaClient.setBasePath(tablePath); - metaClient = HoodieTableMetaClient.reload(metaClient); + + initMetaClient(); } /** @@ -168,7 +163,7 @@ private void generateCommits() throws IOException { HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); // Create the write client to write some records in - HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath) + HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) .withDeleteParallelism(2).forTable(tableName) .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build(); diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCommitsCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCommitsCommand.java index 18f4a387d474e..fd533be09b6be 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCommitsCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCommitsCommand.java @@ -54,16 +54,16 @@ public class ITTestCommitsCommand extends AbstractShellIntegrationTest { @BeforeEach public void init() throws IOException { - String tableName = "test_table_" + ITTestCommitsCommand.class.getName(); - String tablePath = Paths.get(basePath, tableName).toString(); + tableName = "test_table_" + ITTestCommitsCommand.class.getName(); + basePath = Paths.get(basePath, tableName).toString(); HoodieCLI.conf = jsc.hadoopConfiguration(); // Create table and connect new TableCommand().createTable( - tablePath, tableName, HoodieTableType.COPY_ON_WRITE.name(), + basePath, tableName, HoodieTableType.COPY_ON_WRITE.name(), "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); - metaClient.setBasePath(tablePath); - metaClient = HoodieTableMetaClient.reload(metaClient); + + initMetaClient(); } /** diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCompactionCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCompactionCommand.java index 4734f45e7074b..267cee70f2893 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCompactionCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCompactionCommand.java @@ -32,7 +32,6 @@ import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; -import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; @@ -48,7 +47,6 @@ import org.apache.hudi.testutils.HoodieClientTestBase; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; - import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.springframework.shell.core.CommandResult; @@ -73,21 +71,18 @@ */ public class ITTestCompactionCommand extends AbstractShellIntegrationTest { - private String tablePath; - private String tableName; - @BeforeEach public void init() throws IOException { tableName = "test_table_" + ITTestCompactionCommand.class.getName(); - tablePath = Paths.get(basePath, tableName).toString(); + basePath = Paths.get(basePath, tableName).toString(); HoodieCLI.conf = jsc.hadoopConfiguration(); // Create table and connect new 
TableCommand().createTable( - tablePath, tableName, HoodieTableType.MERGE_ON_READ.name(), + basePath, tableName, HoodieTableType.MERGE_ON_READ.name(), "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); - metaClient.setBasePath(tablePath); - metaClient = HoodieTableMetaClient.reload(metaClient); + + initMetaClient(); } /** @@ -298,7 +293,7 @@ private void generateCommits() throws IOException { HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); // Create the write client to write some records in - HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath) + HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) .withDeleteParallelism(2).forTable(tableName) .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build(); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java index 32a8dee517389..4b747d3a77c00 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java @@ -795,7 +795,7 @@ public HoodieRestoreMetadata restoreToInstant(final String instantTime, boolean final String restoreInstantTime = HoodieActiveTimeline.createNewInstantTime(); Timer.Context timerContext = metrics.getRollbackCtx(); try { - HoodieTable table = initTable(WriteOperationType.UNKNOWN, Option.empty(), initialMetadataTableIfNecessary); + HoodieTable table = initTable(WriteOperationType.UNKNOWN, Option.of(restoreInstantTime), initialMetadataTableIfNecessary); Option restorePlanOption = table.scheduleRestore(context, restoreInstantTime, instantTime); if (restorePlanOption.isPresent()) { HoodieRestoreMetadata restoreMetadata = table.restore(context, restoreInstantTime, instantTime); @@ -1035,7 +1035,8 @@ public Option index(String indexInstantTime) { public void dropIndex(List partitionTypes) { HoodieTable table = createTable(config, hadoopConf); String dropInstant = HoodieActiveTimeline.createNewInstantTime(); - this.txnManager.beginTransaction(); + HoodieInstant ownerInstant = new HoodieInstant(true, HoodieTimeline.INDEXING_ACTION, dropInstant); + this.txnManager.beginTransaction(Option.of(ownerInstant), Option.empty()); try { context.setJobStatus(this.getClass().getSimpleName(), "Dropping partitions from metadata table"); table.getMetadataWriter(dropInstant).ifPresent(w -> { @@ -1046,7 +1047,7 @@ public void dropIndex(List partitionTypes) { } }); } finally { - this.txnManager.endTransaction(); + this.txnManager.endTransaction(Option.of(ownerInstant)); } } @@ -1451,17 +1452,20 @@ protected final HoodieTable initTable(WriteOperationType operationType, Option ownerInstant = Option.empty(); + if (instantTime.isPresent()) { + ownerInstant = Option.of(new HoodieInstant(true, CommitUtils.getCommitActionType(operationType, metaClient.getTableType()), instantTime.get())); + } + this.txnManager.beginTransaction(ownerInstant, Option.empty()); try { tryUpgrade(metaClient, instantTime); table = doInitTable(metaClient, instantTime, initialMetadataTableIfNecessary); } finally { - this.txnManager.endTransaction(); + this.txnManager.endTransaction(ownerInstant); } // Validate table properties - 
metaClient.validateTableProperties(config.getProps(), operationType); + metaClient.validateTableProperties(config.getProps()); // Make sure that FS View is in sync table.getHoodieView().sync(); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/HoodieTimelineArchiver.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/HoodieTimelineArchiver.java index ca76e4e3bf3ba..190a5fe1c6064 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/HoodieTimelineArchiver.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/HoodieTimelineArchiver.java @@ -157,7 +157,8 @@ public boolean archiveIfRequired(HoodieEngineContext context) throws IOException public boolean archiveIfRequired(HoodieEngineContext context, boolean acquireLock) throws IOException { try { if (acquireLock) { - txnManager.beginTransaction(); + // there is no owner or instant time per se for archival. + txnManager.beginTransaction(Option.empty(), Option.empty()); } List instantsToArchive = getInstantsToArchive().collect(Collectors.toList()); verifyLastMergeArchiveFilesIfNecessary(context); @@ -179,7 +180,7 @@ public boolean archiveIfRequired(HoodieEngineContext context, boolean acquireLoc } finally { close(); if (acquireLock) { - txnManager.endTransaction(); + txnManager.endTransaction(Option.empty()); } } } @@ -587,19 +588,16 @@ public void archive(HoodieEngineContext context, List instants) t List records = new ArrayList<>(); for (HoodieInstant hoodieInstant : instants) { try { - if (table.getActiveTimeline().isEmpty(hoodieInstant) - && ( - hoodieInstant.getAction().equals(HoodieTimeline.CLEAN_ACTION) - || (hoodieInstant.getAction().equals(HoodieTimeline.ROLLBACK_ACTION) && hoodieInstant.isCompleted()) - ) - ) { - table.getActiveTimeline().deleteEmptyInstantIfExists(hoodieInstant); + deleteAnyLeftOverMarkers(context, hoodieInstant); + // in local FS and HDFS, there could be empty completed instants due to crash. + if (table.getActiveTimeline().isEmpty(hoodieInstant) && hoodieInstant.isCompleted()) { + // lets add an entry to the archival, even if not for the plan. 
+ records.add(createAvroRecordFromEmptyInstant(hoodieInstant)); } else { - deleteAnyLeftOverMarkers(context, hoodieInstant); records.add(convertToAvroRecord(hoodieInstant)); - if (records.size() >= this.config.getCommitArchivalBatchSize()) { - writeToFile(wrapperSchema, records); - } + } + if (records.size() >= this.config.getCommitArchivalBatchSize()) { + writeToFile(wrapperSchema, records); } } catch (Exception e) { LOG.error("Failed to archive commits, .commit file: " + hoodieInstant.getFileName(), e); @@ -636,4 +634,8 @@ private IndexedRecord convertToAvroRecord(HoodieInstant hoodieInstant) throws IOException { return MetadataConversionUtils.createMetaWrapper(hoodieInstant, metaClient); } + + private IndexedRecord createAvroRecordFromEmptyInstant(HoodieInstant hoodieInstant) throws IOException { + return MetadataConversionUtils.createMetaWrapperForEmptyInstant(hoodieInstant); + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/TransactionManager.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/TransactionManager.java index d9b9d3d269bf7..aef1fee5e0794 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/TransactionManager.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/TransactionManager.java @@ -45,14 +45,6 @@ public TransactionManager(HoodieWriteConfig config, FileSystem fs) { this.isOptimisticConcurrencyControlEnabled = config.getWriteConcurrencyMode().supportsOptimisticConcurrencyControl(); } - public void beginTransaction() { - if (isOptimisticConcurrencyControlEnabled) { - LOG.info("Transaction starting without a transaction owner"); - lockManager.lock(); - LOG.info("Transaction started without a transaction owner"); - } - } - public void beginTransaction(Option newTxnOwnerInstant, Option lastCompletedTxnOwnerInstant) { if (isOptimisticConcurrencyControlEnabled) { @@ -65,30 +57,25 @@ public void beginTransaction(Option newTxnOwnerInstant, } } - public void endTransaction() { - if (isOptimisticConcurrencyControlEnabled) { - LOG.info("Transaction ending without a transaction owner"); - lockManager.unlock(); - LOG.info("Transaction ended without a transaction owner"); - } - } - public void endTransaction(Option currentTxnOwnerInstant) { if (isOptimisticConcurrencyControlEnabled) { LOG.info("Transaction ending with transaction owner " + currentTxnOwnerInstant); - reset(currentTxnOwnerInstant, Option.empty(), Option.empty()); - lockManager.unlock(); - LOG.info("Transaction ended with transaction owner " + currentTxnOwnerInstant); + if (reset(currentTxnOwnerInstant, Option.empty(), Option.empty())) { + lockManager.unlock(); + LOG.info("Transaction ended with transaction owner " + currentTxnOwnerInstant); + } } } - private synchronized void reset(Option callerInstant, + private synchronized boolean reset(Option callerInstant, Option newTxnOwnerInstant, Option lastCompletedTxnOwnerInstant) { if (!this.currentTxnOwnerInstant.isPresent() || this.currentTxnOwnerInstant.get().equals(callerInstant.get())) { this.currentTxnOwnerInstant = newTxnOwnerInstant; this.lastCompletedTxnOwnerInstant = lastCompletedTxnOwnerInstant; + return true; } + return false; } public void close() { diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/LockManager.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/LockManager.java index 913736cad8a91..ca15c4fdc2a13 
100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/LockManager.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/LockManager.java @@ -70,11 +70,12 @@ public void lock() { } LOG.info("Retrying to acquire lock..."); Thread.sleep(maxWaitTimeInMs); - retryCount++; } catch (HoodieLockException | InterruptedException e) { if (retryCount >= maxRetries) { throw new HoodieLockException("Unable to acquire lock, lock object ", e); } + } finally { + retryCount++; } } if (!acquired) { diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/MetadataConversionUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/MetadataConversionUtils.java index d588a9c5dd0c9..342de74a11395 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/MetadataConversionUtils.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/MetadataConversionUtils.java @@ -125,6 +125,46 @@ public static HoodieArchivedMetaEntry createMetaWrapper(HoodieInstant hoodieInst return archivedMetaWrapper; } + public static HoodieArchivedMetaEntry createMetaWrapperForEmptyInstant(HoodieInstant hoodieInstant) throws IOException { + HoodieArchivedMetaEntry archivedMetaWrapper = new HoodieArchivedMetaEntry(); + archivedMetaWrapper.setCommitTime(hoodieInstant.getTimestamp()); + archivedMetaWrapper.setActionState(hoodieInstant.getState().name()); + switch (hoodieInstant.getAction()) { + case HoodieTimeline.CLEAN_ACTION: { + archivedMetaWrapper.setActionType(ActionType.clean.name()); + break; + } + case HoodieTimeline.COMMIT_ACTION: { + archivedMetaWrapper.setActionType(ActionType.commit.name()); + break; + } + case HoodieTimeline.DELTA_COMMIT_ACTION: { + archivedMetaWrapper.setActionType(ActionType.deltacommit.name()); + break; + } + case HoodieTimeline.REPLACE_COMMIT_ACTION: { + archivedMetaWrapper.setActionType(ActionType.replacecommit.name()); + break; + } + case HoodieTimeline.ROLLBACK_ACTION: { + archivedMetaWrapper.setActionType(ActionType.rollback.name()); + break; + } + case HoodieTimeline.SAVEPOINT_ACTION: { + archivedMetaWrapper.setActionType(ActionType.savepoint.name()); + break; + } + case HoodieTimeline.COMPACTION_ACTION: { + archivedMetaWrapper.setActionType(ActionType.compaction.name()); + break; + } + default: { + throw new UnsupportedOperationException("Action not fully supported yet"); + } + } + return archivedMetaWrapper; + } + public static Option getInflightReplaceMetadata(HoodieTableMetaClient metaClient, HoodieInstant instant) throws IOException { Option inflightContent = metaClient.getActiveTimeline().getInstantDetails(instant); if (!inflightContent.isPresent() || inflightContent.get().length == 0) { diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java index 36f9d169faa47..eee6f4f4927e0 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java @@ -87,7 +87,7 @@ public class HoodieClusteringConfig extends HoodieConfig { .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "small.file.limit") .defaultValue(String.valueOf(300 * 1024 * 1024L)) .sinceVersion("0.7.0") - .withDocumentation("Files smaller than the size specified here 
are candidates for clustering"); + .withDocumentation("Files smaller than the size in bytes specified here are candidates for clustering"); public static final ConfigProperty PARTITION_REGEX_PATTERN = ConfigProperty .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "partition.regex.pattern") diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieLockConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieLockConfig.java index 723c9f3cbff48..9ea28fbbd42e7 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieLockConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieLockConfig.java @@ -92,7 +92,7 @@ public class HoodieLockConfig extends HoodieConfig { public static final ConfigProperty LOCK_ACQUIRE_CLIENT_NUM_RETRIES = ConfigProperty .key(LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY) - .defaultValue(String.valueOf(0)) + .defaultValue(String.valueOf(10)) .sinceVersion("0.8.0") .withDocumentation("Maximum number of times to retry to acquire lock additionally from the lock manager."); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieMemoryConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieMemoryConfig.java index 8845ccbeeec65..4e37796393a73 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieMemoryConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieMemoryConfig.java @@ -65,17 +65,17 @@ public class HoodieMemoryConfig extends HoodieConfig { public static final ConfigProperty MAX_MEMORY_FOR_MERGE = ConfigProperty .key("hoodie.memory.merge.max.size") .defaultValue(DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES) - .withDocumentation("Maximum amount of memory used for merge operations, before spilling to local storage."); + .withDocumentation("Maximum amount of memory used in bytes for merge operations, before spilling to local storage."); public static final ConfigProperty MAX_MEMORY_FOR_COMPACTION = ConfigProperty .key("hoodie.memory.compaction.max.size") .noDefaultValue() - .withDocumentation("Maximum amount of memory used for compaction operations, before spilling to local storage."); + .withDocumentation("Maximum amount of memory used in bytes for compaction operations in bytes , before spilling to local storage."); public static final ConfigProperty MAX_DFS_STREAM_BUFFER_SIZE = ConfigProperty .key("hoodie.memory.dfs.buffer.max.size") .defaultValue(16 * 1024 * 1024) - .withDocumentation("Property to control the max memory for dfs input stream buffer size"); + .withDocumentation("Property to control the max memory in bytes for dfs input stream buffer size"); public static final ConfigProperty SPILLABLE_MAP_BASE_PATH = ConfigProperty .key("hoodie.memory.spillable.map.path") diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieStorageConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieStorageConfig.java index 6447a039cc069..ba3888863d557 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieStorageConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieStorageConfig.java @@ -42,25 +42,25 @@ public class HoodieStorageConfig extends HoodieConfig { public static final ConfigProperty PARQUET_MAX_FILE_SIZE = ConfigProperty .key("hoodie.parquet.max.file.size") .defaultValue(String.valueOf(120 * 1024 * 
1024)) - .withDocumentation("Target size for parquet files produced by Hudi write phases. " + .withDocumentation("Target size in bytes for parquet files produced by Hudi write phases. " + "For DFS, this needs to be aligned with the underlying filesystem block size for optimal performance."); public static final ConfigProperty PARQUET_BLOCK_SIZE = ConfigProperty .key("hoodie.parquet.block.size") .defaultValue(String.valueOf(120 * 1024 * 1024)) - .withDocumentation("Parquet RowGroup size. It's recommended to make this large enough that scan costs can be" + .withDocumentation("Parquet RowGroup size in bytes. It's recommended to make this large enough that scan costs can be" + " amortized by packing enough column values into a single row group."); public static final ConfigProperty PARQUET_PAGE_SIZE = ConfigProperty .key("hoodie.parquet.page.size") .defaultValue(String.valueOf(1 * 1024 * 1024)) - .withDocumentation("Parquet page size. Page is the unit of read within a parquet file. " + .withDocumentation("Parquet page size in bytes. Page is the unit of read within a parquet file. " + "Within a block, pages are compressed separately."); public static final ConfigProperty ORC_FILE_MAX_SIZE = ConfigProperty .key("hoodie.orc.max.file.size") .defaultValue(String.valueOf(120 * 1024 * 1024)) - .withDocumentation("Target file size for ORC base files."); + .withDocumentation("Target file size in bytes for ORC base files."); public static final ConfigProperty ORC_STRIPE_SIZE = ConfigProperty .key("hoodie.orc.stripe.size") @@ -75,12 +75,12 @@ public class HoodieStorageConfig extends HoodieConfig { public static final ConfigProperty HFILE_MAX_FILE_SIZE = ConfigProperty .key("hoodie.hfile.max.file.size") .defaultValue(String.valueOf(120 * 1024 * 1024)) - .withDocumentation("Target file size for HFile base files."); + .withDocumentation("Target file size in bytes for HFile base files."); public static final ConfigProperty HFILE_BLOCK_SIZE = ConfigProperty .key("hoodie.hfile.block.size") .defaultValue(String.valueOf(1024 * 1024)) - .withDocumentation("Lower values increase the size of metadata tracked within HFile, but can offer potentially " + .withDocumentation("Lower values increase the size in bytes of metadata tracked within HFile, but can offer potentially " + "faster lookup times."); public static final ConfigProperty LOGFILE_DATA_BLOCK_FORMAT = ConfigProperty @@ -91,13 +91,13 @@ public class HoodieStorageConfig extends HoodieConfig { public static final ConfigProperty LOGFILE_MAX_SIZE = ConfigProperty .key("hoodie.logfile.max.size") .defaultValue(String.valueOf(1024 * 1024 * 1024)) // 1 GB - .withDocumentation("LogFile max size. This is the maximum size allowed for a log file " + .withDocumentation("LogFile max size in bytes. This is the maximum size allowed for a log file " + "before it is rolled over to the next version."); public static final ConfigProperty LOGFILE_DATA_BLOCK_MAX_SIZE = ConfigProperty .key("hoodie.logfile.data.block.max.size") .defaultValue(String.valueOf(256 * 1024 * 1024)) - .withDocumentation("LogFile Data block max size. This is the maximum size allowed for a single data block " + .withDocumentation("LogFile Data block max size in bytes. This is the maximum size allowed for a single data block " + "to be appended to a log file. This helps to make sure the data appended to the log file is broken up " + "into sizable blocks to prevent from OOM errors. 
This size should be greater than the JVM memory."); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java index d861ffe970c80..7b49a7a466785 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java @@ -150,7 +150,7 @@ public class HoodieWriteConfig extends HoodieConfig { .key("hoodie.table.base.file.format") .defaultValue(HoodieFileFormat.PARQUET) .withAlternatives("hoodie.table.ro.file.format") - .withDocumentation(""); + .withDocumentation("Base file format to store all the base file data."); public static final ConfigProperty BASE_PATH = ConfigProperty .key("hoodie.base.path") @@ -2012,7 +2012,9 @@ public WriteConcurrencyMode getWriteConcurrencyMode() { * @return True if any table services are configured to run inline, false otherwise. */ public Boolean areAnyTableServicesExecutedInline() { - return inlineClusteringEnabled() || inlineCompactionEnabled() || isAutoClean() || isAutoArchive(); + return areTableServicesEnabled() + && (inlineClusteringEnabled() || inlineCompactionEnabled() + || (isAutoClean() && !isAsyncClean()) || (isAutoArchive() && !isAsyncArchive())); } /** @@ -2021,9 +2023,10 @@ public Boolean areAnyTableServicesExecutedInline() { * @return True if any table services are configured to run async, false otherwise. */ public Boolean areAnyTableServicesAsync() { - return isAsyncClusteringEnabled() + return areTableServicesEnabled() + && (isAsyncClusteringEnabled() || (getTableType() == HoodieTableType.MERGE_ON_READ && !inlineCompactionEnabled()) - || isAsyncClean() || isAsyncArchive(); + || (isAutoClean() && isAsyncClean()) || (isAutoArchive() && isAsyncArchive())); } public Boolean areAnyTableServicesScheduledInline() { @@ -2465,6 +2468,8 @@ protected void setDefaults() { writeConfig.setDefaultValue(MARKERS_TYPE, getDefaultMarkersType(engineType)); // Check for mandatory properties writeConfig.setDefaults(HoodieWriteConfig.class.getName()); + // Set default values of HoodieHBaseIndexConfig + writeConfig.setDefaults(HoodieHBaseIndexConfig.class.getName()); // Make sure the props is propagated writeConfig.setDefaultOnCondition( !isIndexConfigSet, HoodieIndexConfig.newBuilder().withEngineType(engineType).fromProperties( diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieConcatHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieConcatHandle.java index c33c0f08ca830..022f600b5e078 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieConcatHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieConcatHandle.java @@ -20,6 +20,7 @@ import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.model.HoodieRecordPayload; @@ -93,7 +94,8 @@ public HoodieConcatHandle(HoodieWriteConfig config, String instantTime, HoodieTa public void write(GenericRecord oldRecord) { String key = KeyGenUtils.getRecordKeyFromGenericRecord(oldRecord, keyGeneratorOpt); try { - fileWriter.writeAvro(key, oldRecord); + // NOTE: We're enforcing preservation of the record 
metadata to keep existing semantic + writeToFile(new HoodieKey(key, partitionPath), oldRecord, true); } catch (IOException | RuntimeException e) { String errMsg = String.format("Failed to write old record into new file for key %s from old file %s to new file %s with writerSchema %s", key, getOldFilePath(), newFilePath, writeSchemaWithMetaFields.toString(true)); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java index 91a7622bf8065..41d583668a933 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java @@ -142,7 +142,7 @@ public void write(HoodieRecord record, Option avroRecord) { fileWriter.writeAvro(record.getRecordKey(), rewriteRecordWithMetadata((GenericRecord) avroRecord.get(), path.getName())); } else { - fileWriter.writeAvroWithMetadata(rewriteRecord((GenericRecord) avroRecord.get()), record); + fileWriter.writeAvroWithMetadata(record.getKey(), rewriteRecord((GenericRecord) avroRecord.get())); } // update the new location of record, so we know where to find it next record.unseal(); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java index 06e752f59daea..3363571ddf0cb 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java @@ -22,6 +22,7 @@ import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieOperation; import org.apache.hudi.common.model.HoodiePartitionMetadata; import org.apache.hudi.common.model.HoodieRecord; @@ -292,13 +293,7 @@ protected boolean writeRecord(HoodieRecord hoodieRecord, Option> newRecordsItr = (keyToNewRecords instanceof ExternalSpillableMap) diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieSortedMergeHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieSortedMergeHandle.java index 931b08c2fe0c2..d6c1d1be40f36 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieSortedMergeHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieSortedMergeHandle.java @@ -47,7 +47,7 @@ */ public class HoodieSortedMergeHandle extends HoodieMergeHandle { - private Queue newRecordKeysSorted = new PriorityQueue<>(); + private final Queue newRecordKeysSorted = new PriorityQueue<>(); public HoodieSortedMergeHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, Iterator> recordItr, String partitionPath, String fileId, TaskContextSupplier taskContextSupplier, diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java index 89babc7725d6e..5d5760961a461 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java @@ -46,6 +46,9 @@ import java.io.IOException; 
import java.util.Collections; import java.util.List; +import java.util.HashMap; + +import static org.apache.hudi.common.util.StringUtils.isNullOrEmpty; /** * Base class for all write operations logically performed at the file group level. @@ -98,6 +101,8 @@ public abstract class HoodieWriteHandle protected final String fileId; protected final String writeToken; protected final TaskContextSupplier taskContextSupplier; + // For full schema evolution + protected final boolean schemaOnReadEnabled; public HoodieWriteHandle(HoodieWriteConfig config, String instantTime, String partitionPath, String fileId, HoodieTable hoodieTable, TaskContextSupplier taskContextSupplier) { @@ -120,6 +125,7 @@ protected HoodieWriteHandle(HoodieWriteConfig config, String instantTime, String !hoodieTable.getIndex().isImplicitWithStorage(), config.getWriteStatusFailureFraction()); this.taskContextSupplier = taskContextSupplier; this.writeToken = makeWriteToken(); + schemaOnReadEnabled = !isNullOrEmpty(hoodieTable.getConfig().getInternalSchema()); } /** @@ -224,11 +230,13 @@ public void write(HoodieRecord record, Option avroRecord, Option< * Rewrite the GenericRecord with the Schema containing the Hoodie Metadata fields. */ protected GenericRecord rewriteRecord(GenericRecord record) { - return HoodieAvroUtils.rewriteRecord(record, writeSchemaWithMetaFields); + return schemaOnReadEnabled ? HoodieAvroUtils.rewriteRecordWithNewSchema(record, writeSchemaWithMetaFields, new HashMap<>()) + : HoodieAvroUtils.rewriteRecord(record, writeSchemaWithMetaFields); } protected GenericRecord rewriteRecordWithMetadata(GenericRecord record, String fileName) { - return HoodieAvroUtils.rewriteRecordWithMetadata(record, writeSchemaWithMetaFields, fileName); + return schemaOnReadEnabled ? HoodieAvroUtils.rewriteEvolutionRecordWithMetadata(record, writeSchemaWithMetaFields, fileName) + : HoodieAvroUtils.rewriteRecordWithMetadata(record, writeSchemaWithMetaFields, fileName); } public abstract List close(); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriter.java index 9f749566b255b..1d1dd5c9bae6d 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriter.java @@ -21,6 +21,7 @@ import java.util.concurrent.atomic.AtomicLong; import org.apache.avro.generic.GenericRecord; import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.avro.generic.IndexedRecord; @@ -29,7 +30,7 @@ public interface HoodieFileWriter { - void writeAvroWithMetadata(R newRecord, HoodieRecord record) throws IOException; + void writeAvroWithMetadata(HoodieKey key, R newRecord) throws IOException; boolean canWrite(); @@ -37,9 +38,9 @@ public interface HoodieFileWriter { void writeAvro(String key, R oldRecord) throws IOException; - default void prepRecordWithMetadata(R avroRecord, HoodieRecord record, String instantTime, Integer partitionId, AtomicLong recordIndex, String fileName) { + default void prepRecordWithMetadata(HoodieKey key, R avroRecord, String instantTime, Integer partitionId, AtomicLong recordIndex, String fileName) { String seqId = HoodieRecord.generateSequenceId(instantTime, partitionId, recordIndex.getAndIncrement()); - HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) 
avroRecord, record.getRecordKey(), record.getPartitionPath(), fileName); + HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord, key.getRecordKey(), key.getPartitionPath(), fileName); HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord, instantTime, seqId); return; } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileWriter.java index 1642eb2c42fc6..91f79cefa23d2 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileWriter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileWriter.java @@ -18,16 +18,6 @@ package org.apache.hudi.io.storage; -import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.common.bloom.BloomFilter; -import org.apache.hudi.common.engine.TaskContextSupplier; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.fs.HoodieWrapperFileSystem; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.StringUtils; - import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; @@ -40,6 +30,15 @@ import org.apache.hadoop.hbase.io.hfile.HFileContext; import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder; import org.apache.hadoop.io.Writable; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.engine.TaskContextSupplier; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.fs.HoodieWrapperFileSystem; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; import java.io.DataInput; import java.io.DataOutput; @@ -111,13 +110,13 @@ public HoodieHFileWriter(String instantTime, Path file, HoodieHFileConfig hfileC } @Override - public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException { + public void writeAvroWithMetadata(HoodieKey key, R avroRecord) throws IOException { if (populateMetaFields) { - prepRecordWithMetadata(avroRecord, record, instantTime, + prepRecordWithMetadata(key, avroRecord, instantTime, taskContextSupplier.getPartitionIdSupplier().get(), recordIndex, file.getName()); - writeAvro(record.getRecordKey(), (IndexedRecord) avroRecord); + writeAvro(key.getRecordKey(), avroRecord); } else { - writeAvro(record.getRecordKey(), (IndexedRecord) avroRecord); + writeAvro(key.getRecordKey(), avroRecord); } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieOrcWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieOrcWriter.java index 3fe8be05c09f0..17d5ead3efb79 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieOrcWriter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieOrcWriter.java @@ -18,34 +18,35 @@ package org.apache.hudi.io.storage; -import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY; -import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE; -import static 
org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER; -import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER; - -import java.io.Closeable; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.util.List; -import java.util.concurrent.atomic.AtomicLong; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; -import org.apache.orc.storage.ql.exec.vector.ColumnVector; -import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; import org.apache.hudi.common.bloom.BloomFilter; import org.apache.hudi.common.bloom.HoodieDynamicBoundedBloomFilter; -import org.apache.orc.OrcFile; -import org.apache.orc.TypeDescription; -import org.apache.orc.Writer; import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.fs.HoodieWrapperFileSystem; -import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.util.AvroOrcUtils; +import org.apache.orc.OrcFile; +import org.apache.orc.TypeDescription; +import org.apache.orc.Writer; +import org.apache.orc.storage.ql.exec.vector.ColumnVector; +import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.List; +import java.util.concurrent.atomic.AtomicLong; + +import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY; +import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE; +import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER; +import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER; public class HoodieOrcWriter implements HoodieFileWriter, Closeable { @@ -94,10 +95,10 @@ public HoodieOrcWriter(String instantTime, Path file, HoodieOrcConfig config, Sc } @Override - public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException { - prepRecordWithMetadata(avroRecord, record, instantTime, + public void writeAvroWithMetadata(HoodieKey key, R avroRecord) throws IOException { + prepRecordWithMetadata(key, avroRecord, instantTime, taskContextSupplier.getPartitionIdSupplier().get(), RECORD_INDEX, file.getName()); - writeAvro(record.getRecordKey(), avroRecord); + writeAvro(key.getRecordKey(), avroRecord); } @Override diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetWriter.java index 957a0ff52e91d..5b3c69ddf943e 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetWriter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetWriter.java @@ -18,16 +18,15 @@ package org.apache.hudi.io.storage; +import org.apache.avro.Schema; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.Path; import org.apache.hudi.avro.HoodieAvroWriteSupport; import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.fs.HoodieWrapperFileSystem; -import org.apache.hudi.common.model.HoodieRecord; +import 
org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecordPayload; - -import org.apache.avro.Schema; -import org.apache.avro.generic.IndexedRecord; -import org.apache.hadoop.fs.Path; import org.apache.parquet.hadoop.ParquetFileWriter; import org.apache.parquet.hadoop.ParquetWriter; @@ -84,12 +83,12 @@ public HoodieParquetWriter(String instantTime, } @Override - public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException { + public void writeAvroWithMetadata(HoodieKey key, R avroRecord) throws IOException { if (populateMetaFields) { - prepRecordWithMetadata(avroRecord, record, instantTime, + prepRecordWithMetadata(key, avroRecord, instantTime, taskContextSupplier.getPartitionIdSupplier().get(), recordIndex, file.getName()); super.write(avroRecord); - writeSupport.add(record.getRecordKey()); + writeSupport.add(key.getRecordKey()); } else { super.write(avroRecord); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/TimestampBasedAvroKeyGenerator.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/TimestampBasedAvroKeyGenerator.java index bce7e24c57a5f..c543fd26041a1 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/TimestampBasedAvroKeyGenerator.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/TimestampBasedAvroKeyGenerator.java @@ -37,6 +37,7 @@ import java.io.Serializable; import java.math.BigDecimal; import java.sql.Timestamp; +import java.time.LocalDate; import java.util.TimeZone; import java.util.concurrent.TimeUnit; @@ -175,6 +176,9 @@ public String getPartitionPath(Object partitionVal) { timeMs = convertLongTimeToMillis(((Integer) partitionVal).longValue()); } else if (partitionVal instanceof BigDecimal) { timeMs = convertLongTimeToMillis(((BigDecimal) partitionVal).longValue()); + } else if (partitionVal instanceof LocalDate) { + // Avro uses LocalDate to represent the Date value internal. + timeMs = convertLongTimeToMillis(((LocalDate) partitionVal).toEpochDay()); } else if (partitionVal instanceof CharSequence) { if (!inputFormatter.isPresent()) { throw new HoodieException("Missing inputformatter. 
Ensure " + KeyGeneratorOptions.Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP + " config is set when timestampType is DATE_STRING or MIXED!"); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java index 4faac22a841fe..d080d14a69fad 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java @@ -18,13 +18,9 @@ package org.apache.hudi.metadata; -import org.apache.avro.specific.SpecificRecordBase; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.hudi.avro.model.HoodieCleanMetadata; import org.apache.hudi.avro.model.HoodieIndexPartitionInfo; +import org.apache.hudi.avro.model.HoodieIndexPlan; import org.apache.hudi.avro.model.HoodieInstantInfo; import org.apache.hudi.avro.model.HoodieMetadataRecord; import org.apache.hudi.avro.model.HoodieRestoreMetadata; @@ -70,6 +66,12 @@ import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIndexException; import org.apache.hudi.exception.HoodieMetadataException; + +import org.apache.avro.specific.SpecificRecordBase; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -88,6 +90,9 @@ import java.util.stream.Collectors; import static org.apache.hudi.common.table.HoodieTableConfig.ARCHIVELOG_FOLDER; +import static org.apache.hudi.common.table.timeline.HoodieInstant.State.REQUESTED; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.getIndexInflightInstant; +import static org.apache.hudi.common.table.timeline.TimelineMetadataUtils.deserializeIndexPlan; import static org.apache.hudi.common.util.StringUtils.EMPTY_STRING; import static org.apache.hudi.metadata.HoodieTableMetadata.METADATA_TABLE_NAME_SUFFIX; import static org.apache.hudi.metadata.HoodieTableMetadata.SOLO_COMMIT_TIMESTAMP; @@ -379,21 +384,24 @@ protected void initializeIfNeeded(HoodieTableMeta } // if metadata table exists, then check if any of the enabled partition types needs to be initialized - Set inflightAndCompletedPartitions = getInflightAndCompletedMetadataPartitions(dataMetaClient.getTableConfig()); - List partitionsToInit = this.enabledPartitionTypes.stream() - .filter(p -> !inflightAndCompletedPartitions.contains(p.getPartitionPath()) && !MetadataPartitionType.FILES.equals(p)) - .collect(Collectors.toList()); + // NOTE: It needs to be guarded by async index config because if that is enabled then initialization happens through the index scheduler. 
+ if (!dataWriteConfig.isMetadataAsyncIndex()) { + Set inflightAndCompletedPartitions = getInflightAndCompletedMetadataPartitions(dataMetaClient.getTableConfig()); + LOG.info("Async metadata indexing enabled and following partitions already initialized: " + inflightAndCompletedPartitions); + List partitionsToInit = this.enabledPartitionTypes.stream() + .filter(p -> !inflightAndCompletedPartitions.contains(p.getPartitionPath()) && !MetadataPartitionType.FILES.equals(p)) + .collect(Collectors.toList()); + // if there are no partitions to initialize or there is a pending operation, then don't initialize in this round + if (partitionsToInit.isEmpty() || anyPendingDataInstant(dataMetaClient, inflightInstantTimestamp)) { + return; + } - // if there are no partitions to initialize or there is a pending operation, then don't initialize in this round - if (partitionsToInit.isEmpty() || anyPendingDataInstant(dataMetaClient, inflightInstantTimestamp)) { - return; + String createInstantTime = getInitialCommitInstantTime(dataMetaClient); + initTableMetadata(); // re-init certain flags in BaseTableMetadata + initializeEnabledFileGroups(dataMetaClient, createInstantTime, partitionsToInit); + initialCommit(createInstantTime, partitionsToInit); + updateInitializedPartitionsInTableConfig(partitionsToInit); } - - String createInstantTime = getInitialCommitInstantTime(dataMetaClient); - initTableMetadata(); // re-init certain flags in BaseTableMetadata - initializeEnabledFileGroups(dataMetaClient, createInstantTime, partitionsToInit); - initialCommit(createInstantTime, partitionsToInit); - updateInitializedPartitionsInTableConfig(partitionsToInit); } private boolean metadataTableExists(HoodieTableMetaClient dataMetaClient, @@ -557,6 +565,8 @@ private boolean anyPendingDataInstant(HoodieTableMetaClient dataMetaClient, Opti List pendingDataInstant = dataMetaClient.getActiveTimeline() .getInstants().filter(i -> !i.isCompleted()) .filter(i -> !inflightInstantTimestamp.isPresent() || !i.getTimestamp().equals(inflightInstantTimestamp.get())) + // regular writers should not be blocked due to pending indexing action + .filter(i -> !HoodieTimeline.INDEXING_ACTION.equals(i.getAction())) .collect(Collectors.toList()); if (!pendingDataInstant.isEmpty()) { @@ -722,9 +732,33 @@ public void dropMetadataPartitions(List metadataPartition HoodieTableConfig.update(dataMetaClient.getFs(), new Path(dataMetaClient.getMetaPath()), dataMetaClient.getTableConfig().getProps()); LOG.warn("Deleting Metadata Table partitions: " + partitionPath); dataMetaClient.getFs().delete(new Path(metadataWriteConfig.getBasePath(), partitionPath), true); + // delete corresponding pending indexing instant file in the timeline + LOG.warn("Deleting pending indexing instant from the timeline for partition: " + partitionPath); + deletePendingIndexingInstant(dataMetaClient, partitionPath); } } + /** + * Deletes any pending indexing instant, if it exists. + * It reads the plan from indexing.requested file and deletes both requested and inflight instants, + * if the partition path in the plan matches with the given partition path. 
+ */ + private static void deletePendingIndexingInstant(HoodieTableMetaClient metaClient, String partitionPath) { + metaClient.reloadActiveTimeline().filterPendingIndexTimeline().getInstants().filter(instant -> REQUESTED.equals(instant.getState())) + .forEach(instant -> { + try { + HoodieIndexPlan indexPlan = deserializeIndexPlan(metaClient.getActiveTimeline().readIndexPlanAsBytes(instant).get()); + if (indexPlan.getIndexPartitionInfos().stream() + .anyMatch(indexPartitionInfo -> indexPartitionInfo.getMetadataPartitionPath().equals(partitionPath))) { + metaClient.getActiveTimeline().deleteInstantFileIfExists(instant); + metaClient.getActiveTimeline().deleteInstantFileIfExists(getIndexInflightInstant(instant.getTimestamp())); + } + } catch (IOException e) { + LOG.error("Failed to delete the instant file corresponding to " + instant); + } + }); + } + private MetadataRecordsGenerationParams getRecordsGenerationParams() { return new MetadataRecordsGenerationParams( dataMetaClient, @@ -774,11 +808,15 @@ private Set getMetadataPartitionsToUpdate() { // fetch partitions to update from table config Set partitionsToUpdate = getCompletedMetadataPartitions(dataMetaClient.getTableConfig()); // add inflight indexes as well because the file groups have already been initialized, so writers can log updates + // NOTE: Async HoodieIndexer can move some partition to inflight. While that partition is still being built, + // the regular ingestion writers should not be blocked. They can go ahead and log updates to the metadata partition. + // Instead of depending on enabledPartitionTypes, the table config becomes the source of truth for which partitions to update. partitionsToUpdate.addAll(getInflightMetadataPartitions(dataMetaClient.getTableConfig())); if (!partitionsToUpdate.isEmpty()) { return partitionsToUpdate; } // fallback to all enabled partitions if table config returned no partitions + LOG.warn("There are no partitions to update according to table config. Falling back to enabled partition types in the write config."); return getEnabledPartitionTypes().stream().map(MetadataPartitionType::getPartitionPath).collect(Collectors.toSet()); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/BulkInsertPartitioner.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/BulkInsertPartitioner.java index fd1558a8232bb..63b502531a896 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/BulkInsertPartitioner.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/BulkInsertPartitioner.java @@ -18,12 +18,18 @@ package org.apache.hudi.table; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.io.WriteHandleFactory; + +import java.io.Serializable; + /** * Repartition input records into at least expected number of output spark partitions. It should give below guarantees - * Output spark partition will have records from only one hoodie partition. - Average records per output spark * partitions should be almost equal to (#inputRecords / #outputSparkPartitions) to avoid possible skews. */ -public interface BulkInsertPartitioner { +public interface BulkInsertPartitioner extends Serializable { /** * Repartitions the input records into at least expected number of output spark partitions. @@ -38,4 +44,24 @@ public interface BulkInsertPartitioner { * @return {@code true} if the records within a partition are sorted; {@code false} otherwise. 
*/ boolean arePartitionRecordsSorted(); + + /** + * Return file group id prefix for the given data partition. + * By defauult, return a new file group id prefix, so that incoming records will route to a fresh new file group + * @param partitionId data partition + * @return + */ + default String getFileIdPfx(int partitionId) { + return FSUtils.createNewFileIdPfx(); + } + + /** + * Return write handle factory for the given partition. + * @param partitionId data partition + * @return + */ + default Option getWriteHandleFactory(int partitionId) { + return Option.empty(); + } + } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseBulkInsertHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseBulkInsertHelper.java index ad2145c3501bf..5355194ff75bf 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseBulkInsertHelper.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseBulkInsertHelper.java @@ -42,7 +42,7 @@ public abstract HoodieWriteMetadata bulkInsert(I inputRecords, String instant public abstract O bulkInsert(I inputRecords, String instantTime, HoodieTable table, HoodieWriteConfig config, boolean performDedupe, - Option userDefinedBulkInsertPartitioner, + BulkInsertPartitioner partitioner, boolean addMetadataFields, int parallelism, WriteHandleFactory writeHandleFactory); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieMergeHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieMergeHelper.java index 578cdf0bc7f14..04dd29c63c5b4 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieMergeHelper.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/HoodieMergeHelper.java @@ -36,6 +36,7 @@ import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter; import org.apache.hudi.internal.schema.utils.AvroSchemaEvolutionUtils; import org.apache.hudi.internal.schema.utils.SerDeHelper; +import org.apache.hudi.internal.schema.utils.InternalSchemaUtils; import org.apache.hudi.io.HoodieMergeHandle; import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.hudi.io.storage.HoodieFileReaderFactory; @@ -52,6 +53,8 @@ import java.io.IOException; import java.util.Iterator; import java.util.List; +import java.util.HashMap; +import java.util.Map; import java.util.stream.Collectors; public class HoodieMergeHelper extends @@ -93,6 +96,7 @@ public void runMerge(HoodieTable>, HoodieData querySchemaOpt = SerDeHelper.fromJson(table.getConfig().getInternalSchema()); boolean needToReWriteRecord = false; + Map renameCols = new HashMap<>(); // TODO support bootstrap if (querySchemaOpt.isPresent() && !baseFile.getBootstrapBaseFile().isPresent()) { // check implicitly add columns, and position reorder(spark sql may change cols order) @@ -109,10 +113,14 @@ public void runMerge(HoodieTable>, HoodieData>, HoodieData execute() { Set requestedPartitions = partitionIndexTypes.stream().map(MetadataPartitionType::getPartitionPath).collect(Collectors.toSet()); requestedPartitions.removeAll(indexesInflightOrCompleted); if (!requestedPartitions.isEmpty()) { - LOG.warn(String.format("Following partitions already exist or inflight: %s. Going to index only these partitions: %s", + LOG.warn(String.format("Following partitions already exist or inflight: %s. 
Going to schedule indexing of only these partitions: %s", indexesInflightOrCompleted, requestedPartitions)); } else { LOG.error("All requested index types are inflight or completed: " + partitionIndexTypes); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackStrategy.java index ed37798607bd8..e3159abad8de7 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackStrategy.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackStrategy.java @@ -239,7 +239,15 @@ private FileStatus[] fetchFilesFromCommitMetadata(HoodieInstant instantToRollbac SerializablePathFilter pathFilter = getSerializablePathFilter(baseFileExtension, instantToRollback.getTimestamp()); Path[] filePaths = getFilesFromCommitMetadata(basePath, commitMetadata, partitionPath); - return fs.listStatus(filePaths, pathFilter); + return fs.listStatus(Arrays.stream(filePaths).filter(entry -> { + try { + return fs.exists(entry); + } catch (IOException e) { + LOG.error("Exists check failed for " + entry.toString(), e); + } + // if IOException is thrown, do not ignore. lets try to add the file of interest to be deleted. we can't miss any files to be rolled back. + return false; + }).toArray(Path[]::new), pathFilter); } private FileStatus[] fetchFilesFromListFiles(HoodieInstant instantToRollback, String partitionPath, String basePath, diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestTransactionManager.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestTransactionManager.java index 22f8017841a83..6573560e752eb 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestTransactionManager.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestTransactionManager.java @@ -69,20 +69,28 @@ private HoodieWriteConfig getWriteConfig() { @Test public void testSingleWriterTransaction() { - transactionManager.beginTransaction(); - transactionManager.endTransaction(); + Option lastCompletedInstant = getInstant("0000001"); + Option newTxnOwnerInstant = getInstant("0000002"); + transactionManager.beginTransaction(newTxnOwnerInstant, lastCompletedInstant); + transactionManager.endTransaction(newTxnOwnerInstant); } @Test public void testSingleWriterNestedTransaction() { - transactionManager.beginTransaction(); + Option lastCompletedInstant = getInstant("0000001"); + Option newTxnOwnerInstant = getInstant("0000002"); + transactionManager.beginTransaction(newTxnOwnerInstant, lastCompletedInstant); + + Option lastCompletedInstant1 = getInstant("0000003"); + Option newTxnOwnerInstant1 = getInstant("0000004"); + assertThrows(HoodieLockException.class, () -> { - transactionManager.beginTransaction(); + transactionManager.beginTransaction(newTxnOwnerInstant1, lastCompletedInstant1); }); - transactionManager.endTransaction(); + transactionManager.endTransaction(newTxnOwnerInstant); assertDoesNotThrow(() -> { - transactionManager.endTransaction(); + transactionManager.endTransaction(newTxnOwnerInstant1); }); } @@ -94,11 +102,16 @@ public void testMultiWriterTransactions() { final AtomicBoolean writer1Completed = new AtomicBoolean(false); final AtomicBoolean writer2Completed = new AtomicBoolean(false); + 
Option lastCompletedInstant1 = getInstant("0000001"); + Option newTxnOwnerInstant1 = getInstant("0000002"); + Option lastCompletedInstant2 = getInstant("0000003"); + Option newTxnOwnerInstant2 = getInstant("0000004"); + // Let writer1 get the lock first, then wait for others // to join the sync up point. Thread writer1 = new Thread(() -> { assertDoesNotThrow(() -> { - transactionManager.beginTransaction(); + transactionManager.beginTransaction(newTxnOwnerInstant1, lastCompletedInstant1); }); latch.countDown(); try { @@ -111,7 +124,7 @@ public void testMultiWriterTransactions() { // } assertDoesNotThrow(() -> { - transactionManager.endTransaction(); + transactionManager.endTransaction(newTxnOwnerInstant1); }); writer1Completed.set(true); }); @@ -127,10 +140,10 @@ public void testMultiWriterTransactions() { // } assertDoesNotThrow(() -> { - transactionManager.beginTransaction(); + transactionManager.beginTransaction(newTxnOwnerInstant2, lastCompletedInstant2); }); assertDoesNotThrow(() -> { - transactionManager.endTransaction(); + transactionManager.endTransaction(newTxnOwnerInstant2); }); writer2Completed.set(true); }); @@ -152,6 +165,32 @@ public void testMultiWriterTransactions() { Assertions.assertTrue(writer2Completed.get()); } + @Test + public void testEndTransactionByDiffOwner() throws InterruptedException { + // 1. Begin and end by the same transaction owner + Option lastCompletedInstant = getInstant("0000001"); + Option newTxnOwnerInstant = getInstant("0000002"); + transactionManager.beginTransaction(newTxnOwnerInstant, lastCompletedInstant); + + CountDownLatch countDownLatch = new CountDownLatch(1); + // Another writer thread + Thread writer2 = new Thread(() -> { + Option newTxnOwnerInstant1 = getInstant("0000003"); + transactionManager.endTransaction(newTxnOwnerInstant1); + countDownLatch.countDown(); + }); + + writer2.start(); + countDownLatch.await(30, TimeUnit.SECONDS); + // should not have reset the state within transaction manager since the owner is different. + Assertions.assertTrue(transactionManager.getCurrentTransactionOwner().isPresent()); + Assertions.assertTrue(transactionManager.getLastCompletedTransactionOwner().isPresent()); + + transactionManager.endTransaction(newTxnOwnerInstant); + Assertions.assertFalse(transactionManager.getCurrentTransactionOwner().isPresent()); + Assertions.assertFalse(transactionManager.getLastCompletedTransactionOwner().isPresent()); + } + @Test public void testTransactionsWithInstantTime() { // 1. Begin and end by the same transaction owner @@ -164,14 +203,15 @@ public void testTransactionsWithInstantTime() { Assertions.assertFalse(transactionManager.getCurrentTransactionOwner().isPresent()); Assertions.assertFalse(transactionManager.getLastCompletedTransactionOwner().isPresent()); - // 2. Begin transaction with a new txn owner, but end transaction with no/wrong owner + // 2. 
Begin transaction with a new txn owner, but end transaction with wrong owner lastCompletedInstant = getInstant("0000002"); newTxnOwnerInstant = getInstant("0000003"); transactionManager.beginTransaction(newTxnOwnerInstant, lastCompletedInstant); - transactionManager.endTransaction(); + transactionManager.endTransaction(getInstant("0000004")); // Owner reset would not happen as the end txn was invoked with an incorrect current txn owner Assertions.assertTrue(transactionManager.getCurrentTransactionOwner() == newTxnOwnerInstant); Assertions.assertTrue(transactionManager.getLastCompletedTransactionOwner() == lastCompletedInstant); + transactionManager.endTransaction(newTxnOwnerInstant); // 3. But, we should be able to begin a new transaction for a new owner lastCompletedInstant = getInstant("0000003"); @@ -183,15 +223,7 @@ public void testTransactionsWithInstantTime() { Assertions.assertFalse(transactionManager.getCurrentTransactionOwner().isPresent()); Assertions.assertFalse(transactionManager.getLastCompletedTransactionOwner().isPresent()); - // 4. Transactions with no owners should also go through - transactionManager.beginTransaction(); - Assertions.assertFalse(transactionManager.getCurrentTransactionOwner().isPresent()); - Assertions.assertFalse(transactionManager.getLastCompletedTransactionOwner().isPresent()); - transactionManager.endTransaction(); - Assertions.assertFalse(transactionManager.getCurrentTransactionOwner().isPresent()); - Assertions.assertFalse(transactionManager.getLastCompletedTransactionOwner().isPresent()); - - // 5. Transactions with new instants but with same timestamps should properly reset owners + // 4. Transactions with new instants but with same timestamps should properly reset owners transactionManager.beginTransaction(getInstant("0000005"), Option.empty()); Assertions.assertTrue(transactionManager.getCurrentTransactionOwner().isPresent()); Assertions.assertFalse(transactionManager.getLastCompletedTransactionOwner().isPresent()); diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/config/TestHoodieWriteConfig.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/config/TestHoodieWriteConfig.java index 85d40964b8fd2..3d0da3e49a7f8 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/config/TestHoodieWriteConfig.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/config/TestHoodieWriteConfig.java @@ -29,7 +29,6 @@ import org.apache.hudi.common.model.WriteConcurrencyMode; import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.marker.MarkerType; -import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig.Builder; import org.apache.hudi.index.HoodieIndex; @@ -48,6 +47,7 @@ import java.util.function.Function; import static org.apache.hudi.config.HoodieClusteringConfig.ASYNC_CLUSTERING_ENABLE; +import static org.apache.hudi.config.HoodieCompactionConfig.ASYNC_ARCHIVE; import static org.apache.hudi.config.HoodieCompactionConfig.ASYNC_CLEAN; import static org.apache.hudi.config.HoodieCompactionConfig.AUTO_CLEAN; import static org.apache.hudi.config.HoodieCompactionConfig.FAILED_WRITES_CLEANER_POLICY; @@ -138,7 +138,7 @@ public void testAutoConcurrencyConfigAdjustmentWithTableServices(HoodieTableType put(ASYNC_CLEAN.key(), "false"); put(HoodieWriteConfig.AUTO_ADJUST_LOCK_CONFIGS.key(), "true"); } - }), true, true, WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL, + }), true, true, true, 
WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL, HoodieFailedWritesCleaningPolicy.LAZY, inProcessLockProviderClassName); // 2. Async clean @@ -151,7 +151,7 @@ public void testAutoConcurrencyConfigAdjustmentWithTableServices(HoodieTableType put(ASYNC_CLEAN.key(), "true"); put(HoodieWriteConfig.AUTO_ADJUST_LOCK_CONFIGS.key(), "true"); } - }), true, true, WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL, + }), true, true, true, WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL, HoodieFailedWritesCleaningPolicy.LAZY, inProcessLockProviderClassName); // 3. Async compaction configured @@ -165,7 +165,7 @@ public void testAutoConcurrencyConfigAdjustmentWithTableServices(HoodieTableType put(HoodieWriteConfig.AUTO_ADJUST_LOCK_CONFIGS.key(), "true"); } }), true, - tableType == HoodieTableType.MERGE_ON_READ, + tableType == HoodieTableType.MERGE_ON_READ, true, tableType == HoodieTableType.MERGE_ON_READ ? WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL : WriteConcurrencyMode.valueOf(WRITE_CONCURRENCY_MODE.defaultValue()), @@ -186,10 +186,25 @@ public void testAutoConcurrencyConfigAdjustmentWithTableServices(HoodieTableType put(ASYNC_CLEAN.key(), "false"); put(HoodieWriteConfig.AUTO_ADJUST_LOCK_CONFIGS.key(), "true"); } - }), Option.of(true), Option.of(false), Option.of(true), + }), true, false, true, WriteConcurrencyMode.valueOf(WRITE_CONCURRENCY_MODE.defaultValue()), HoodieFailedWritesCleaningPolicy.valueOf(FAILED_WRITES_CLEANER_POLICY.defaultValue()), HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.defaultValue()); + + // 5. All async services + verifyConcurrencyControlRelatedConfigs(createWriteConfig(new HashMap() { + { + put(HoodieTableConfig.TYPE.key(), tableType.name()); + put(ASYNC_CLUSTERING_ENABLE.key(), "true"); + put(INLINE_COMPACT.key(), "false"); + put(AUTO_CLEAN.key(), "true"); + put(ASYNC_CLEAN.key(), "true"); + put(ASYNC_ARCHIVE.key(), "true"); + put(HoodieWriteConfig.AUTO_ADJUST_LOCK_CONFIGS.key(), "true"); + } + }), true, true, false, + WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL, + HoodieFailedWritesCleaningPolicy.LAZY, inProcessLockProviderClassName); } @ParameterizedTest @@ -205,7 +220,7 @@ public void testAutoAdjustLockConfigs(HoodieTableType tableType) { .build(); verifyConcurrencyControlRelatedConfigs(writeConfig, - true, true, + true, true, true, WriteConcurrencyMode.valueOf(WRITE_CONCURRENCY_MODE.defaultValue()), HoodieFailedWritesCleaningPolicy.valueOf(FAILED_WRITES_CLEANER_POLICY.defaultValue()), HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.defaultValue()); @@ -219,7 +234,7 @@ public void testAutoAdjustLockConfigs(HoodieTableType tableType) { .build(); verifyConcurrencyControlRelatedConfigs(writeConfig, - true, true, + true, true, true, WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL, HoodieFailedWritesCleaningPolicy.LAZY, HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.defaultValue()); } @@ -240,7 +255,7 @@ public void testAutoConcurrencyConfigAdjustmentWithUserConfigs(HoodieTableType t .build(); verifyConcurrencyControlRelatedConfigs(writeConfig, - true, tableType == HoodieTableType.MERGE_ON_READ, + true, tableType == HoodieTableType.MERGE_ON_READ, true, WriteConcurrencyMode.valueOf(WRITE_CONCURRENCY_MODE.defaultValue()), HoodieFailedWritesCleaningPolicy.valueOf(FAILED_WRITES_CLEANER_POLICY.defaultValue()), FileSystemBasedLockProviderTestClass.class.getName()); @@ -257,7 +272,7 @@ public void testAutoConcurrencyConfigAdjustmentWithUserConfigs(HoodieTableType t ZookeeperBasedLockProvider.class.getName()); put(HoodieWriteConfig.AUTO_ADJUST_LOCK_CONFIGS.key(), 
"true"); } - }), true, true, + }), true, true, true, WriteConcurrencyMode.valueOf(WRITE_CONCURRENCY_MODE.defaultValue()), HoodieFailedWritesCleaningPolicy.valueOf(FAILED_WRITES_CLEANER_POLICY.defaultValue()), ZookeeperBasedLockProvider.class.getName()); @@ -271,13 +286,13 @@ public void testAutoConcurrencyConfigAdjustmentWithUserConfigs(HoodieTableType t }); if (writeConfig.areAnyTableServicesAsync()) { verifyConcurrencyControlRelatedConfigs(writeConfig, - true, true, + true, true, true, WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL, HoodieFailedWritesCleaningPolicy.LAZY, InProcessLockProvider.class.getName()); } else { verifyConcurrencyControlRelatedConfigs(writeConfig, - true, false, + true, false, true, WriteConcurrencyMode.valueOf(WRITE_CONCURRENCY_MODE.defaultValue()), HoodieFailedWritesCleaningPolicy.valueOf(FAILED_WRITES_CLEANER_POLICY.defaultValue()), HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.defaultValue()); @@ -294,7 +309,7 @@ public void testAutoConcurrencyConfigAdjustmentWithNoTableService(HoodieTableTyp put(TABLE_SERVICES_ENABLED.key(), "false"); put(HoodieWriteConfig.AUTO_ADJUST_LOCK_CONFIGS.key(), "true"); } - }), false, tableType == HoodieTableType.MERGE_ON_READ, + }), false, false, false, WriteConcurrencyMode.fromValue(WRITE_CONCURRENCY_MODE.defaultValue()), HoodieFailedWritesCleaningPolicy.valueOf(FAILED_WRITES_CLEANER_POLICY.defaultValue()), HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.defaultValue()); @@ -311,7 +326,7 @@ public void testAutoConcurrencyConfigAdjustmentWithNoTableService(HoodieTableTyp FileSystemBasedLockProviderTestClass.class.getName()); put(HoodieWriteConfig.AUTO_ADJUST_LOCK_CONFIGS.key(), "true"); } - }), false, tableType == HoodieTableType.MERGE_ON_READ, + }), false, false, false, WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL, HoodieFailedWritesCleaningPolicy.LAZY, FileSystemBasedLockProviderTestClass.class.getName()); @@ -332,7 +347,7 @@ public void testAutoConcurrencyConfigAdjustmentWithMetadataTableDisabled(HoodieT put(ASYNC_CLEAN.key(), "false"); put(HoodieWriteConfig.AUTO_ADJUST_LOCK_CONFIGS.key(), "true"); } - }), true, true, + }), true, true, true, WriteConcurrencyMode.fromValue(WRITE_CONCURRENCY_MODE.defaultValue()), HoodieFailedWritesCleaningPolicy.valueOf(FAILED_WRITES_CLEANER_POLICY.defaultValue()), HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.defaultValue()); @@ -351,7 +366,8 @@ public void testAutoConcurrencyConfigAdjustmentWithMetadataTableDisabled(HoodieT FileSystemBasedLockProviderTestClass.class.getName()); put(HoodieWriteConfig.AUTO_ADJUST_LOCK_CONFIGS.key(), "true"); } - }), true, true, WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL, + }), true, true, true, + WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL, HoodieFailedWritesCleaningPolicy.LAZY, FileSystemBasedLockProviderTestClass.class.getName()); } @@ -415,32 +431,14 @@ private Map constructConfigMap( private void verifyConcurrencyControlRelatedConfigs( HoodieWriteConfig writeConfig, boolean expectedTableServicesEnabled, boolean expectedAnyTableServicesAsync, + boolean expectedAnyTableServicesExecutedInline, WriteConcurrencyMode expectedConcurrencyMode, HoodieFailedWritesCleaningPolicy expectedCleanPolicy, String expectedLockProviderName) { - verifyConcurrencyControlRelatedConfigs(writeConfig, Option.of(expectedTableServicesEnabled), - Option.of(expectedAnyTableServicesAsync), Option.empty(), expectedConcurrencyMode, - expectedCleanPolicy, expectedLockProviderName); - } - - private void verifyConcurrencyControlRelatedConfigs( - HoodieWriteConfig writeConfig, 
Option expectedTableServicesEnabled, - Option expectedAnyTableServicesAsync, - Option expectedAnyTableServicesExecutedInline, - WriteConcurrencyMode expectedConcurrencyMode, - HoodieFailedWritesCleaningPolicy expectedCleanPolicy, - String expectedLockProviderName) { - if (expectedTableServicesEnabled.isPresent()) { - assertEquals(expectedTableServicesEnabled.get(), writeConfig.areTableServicesEnabled()); - } - if (expectedAnyTableServicesAsync.isPresent()) { - assertEquals(expectedAnyTableServicesAsync.get(), writeConfig.areAnyTableServicesAsync()); - } - if (expectedAnyTableServicesExecutedInline.isPresent()) { - assertEquals(expectedAnyTableServicesExecutedInline.get(), - writeConfig.areAnyTableServicesExecutedInline()); - } - + assertEquals(expectedTableServicesEnabled, writeConfig.areTableServicesEnabled()); + assertEquals(expectedAnyTableServicesAsync, writeConfig.areAnyTableServicesAsync()); + assertEquals( + expectedAnyTableServicesExecutedInline, writeConfig.areAnyTableServicesExecutedInline()); assertEquals(expectedConcurrencyMode, writeConfig.getWriteConcurrencyMode()); assertEquals(expectedCleanPolicy, writeConfig.getFailedWritesCleanPolicy()); assertEquals(expectedLockProviderName, writeConfig.getLockProviderClass()); diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java index 2db8eb0204b34..da6f717258877 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java @@ -154,8 +154,9 @@ public void testWriteReadHFileWithMetaFields(boolean populateMetaFields, boolean record.put("time", Integer.toString(RANDOM.nextInt())); record.put("number", i); if (testAvroWithMeta) { - writer.writeAvroWithMetadata(record, new HoodieAvroRecord(new HoodieKey((String) record.get("_row_key"), - Integer.toString((Integer) record.get("number"))), new EmptyHoodieRecordPayload())); // payload does not matter. GenericRecord passed in is what matters + // payload does not matter. GenericRecord passed in is what matters + writer.writeAvroWithMetadata(new HoodieAvroRecord(new HoodieKey((String) record.get("_row_key"), + Integer.toString((Integer) record.get("number"))), new EmptyHoodieRecordPayload()).getKey(), record); // only HoodieKey will be looked up from the 2nd arg(HoodieRecord). } else { writer.writeAvro(key, record); diff --git a/hudi-client/hudi-client-common/src/test/resources/log4j-surefire.properties b/hudi-client/hudi-client-common/src/test/resources/log4j-surefire.properties index 32af462093ae5..14bbb089724c8 100644 --- a/hudi-client/hudi-client-common/src/test/resources/log4j-surefire.properties +++ b/hudi-client/hudi-client-common/src/test/resources/log4j-surefire.properties @@ -20,9 +20,9 @@ log4j.logger.org.apache=INFO log4j.logger.org.apache.hudi=DEBUG log4j.logger.org.apache.hadoop.hbase=ERROR -# A1 is set to be a ConsoleAppender. +# CONSOLE is set to be a ConsoleAppender. log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. +# CONSOLE uses PatternLayout. 
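Stepping back to the `TestHoodieHFileReaderWriter` hunk above: together with the `HoodieParquetWriter` change at the top of this patch, `writeAvroWithMetadata` now takes the `HoodieKey` first and the Avro record second, instead of a whole `HoodieRecord`. The sketch below shows what a migrated call site looks like; the writer interface is a stand-in (the real writers take an Avro `IndexedRecord` and may throw `IOException`), only the `HoodieKey` usage mirrors the actual API.

```
import org.apache.hudi.common.model.HoodieKey;

public class WriteAvroWithMetadataCallSite {

  // Stand-in for the file writer whose signature changed above.
  interface MetadataAwareWriter<R> {
    void writeAvroWithMetadata(HoodieKey key, R avroRecord);
  }

  public static void main(String[] args) {
    MetadataAwareWriter<String> writer = (key, avroRecord) ->
        System.out.println("writing " + avroRecord + " keyed by " + key.getRecordKey()
            + " in partition " + key.getPartitionPath());

    // Before this patch, callers built a HoodieRecord just so the writer could read its key;
    // now only the HoodieKey travels alongside the Avro payload.
    HoodieKey key = new HoodieKey("uuid-0001", "2022/04/09");
    writer.writeAvroWithMetadata(key, "avro-record-placeholder");
  }
}
```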
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter diff --git a/hudi-client/hudi-flink-client/src/main/resources/log4j-surefire.properties b/hudi-client/hudi-flink-client/src/main/resources/log4j-surefire.properties index 32af462093ae5..14bbb089724c8 100644 --- a/hudi-client/hudi-flink-client/src/main/resources/log4j-surefire.properties +++ b/hudi-client/hudi-flink-client/src/main/resources/log4j-surefire.properties @@ -20,9 +20,9 @@ log4j.logger.org.apache=INFO log4j.logger.org.apache.hudi=DEBUG log4j.logger.org.apache.hadoop.hbase=ERROR -# A1 is set to be a ConsoleAppender. +# CONSOLE is set to be a ConsoleAppender. log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. +# CONSOLE uses PatternLayout. log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter diff --git a/hudi-client/hudi-flink-client/src/test/resources/log4j-surefire.properties b/hudi-client/hudi-flink-client/src/test/resources/log4j-surefire.properties index 32af462093ae5..14bbb089724c8 100644 --- a/hudi-client/hudi-flink-client/src/test/resources/log4j-surefire.properties +++ b/hudi-client/hudi-flink-client/src/test/resources/log4j-surefire.properties @@ -20,9 +20,9 @@ log4j.logger.org.apache=INFO log4j.logger.org.apache.hudi=DEBUG log4j.logger.org.apache.hadoop.hbase=ERROR -# A1 is set to be a ConsoleAppender. +# CONSOLE is set to be a ConsoleAppender. log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. +# CONSOLE uses PatternLayout. log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/JavaExecutionStrategy.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/JavaExecutionStrategy.java index 7d7609f0fa0a9..233c70ecf9eb6 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/JavaExecutionStrategy.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/JavaExecutionStrategy.java @@ -41,6 +41,7 @@ import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieClusteringException; +import org.apache.hudi.execution.bulkinsert.JavaBulkInsertInternalPartitionerFactory; import org.apache.hudi.execution.bulkinsert.JavaCustomColumnsSortPartitioner; import org.apache.hudi.io.IOUtils; import org.apache.hudi.io.storage.HoodieFileReader; @@ -121,16 +122,16 @@ public abstract List performClusteringWithRecordList( * * @param strategyParams Strategy parameters containing columns to sort the data by when clustering. * @param schema Schema of the data including metadata fields. - * @return empty for now. 
+ * @return partitioner for the java engine */ - protected Option>>> getPartitioner(Map strategyParams, Schema schema) { + protected BulkInsertPartitioner>> getPartitioner(Map strategyParams, Schema schema) { if (strategyParams.containsKey(PLAN_STRATEGY_SORT_COLUMNS.key())) { - return Option.of(new JavaCustomColumnsSortPartitioner( + return new JavaCustomColumnsSortPartitioner( strategyParams.get(PLAN_STRATEGY_SORT_COLUMNS.key()).split(","), HoodieAvroUtils.addMetadataFields(schema), - getWriteConfig().isConsistentLogicalTimestampEnabled())); + getWriteConfig().isConsistentLogicalTimestampEnabled()); } else { - return Option.empty(); + return JavaBulkInsertInternalPartitionerFactory.get(getWriteConfig().getBulkInsertSortMode()); } } diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaBulkInsertHelper.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaBulkInsertHelper.java index 39b2916732f2a..e126372aa9068 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaBulkInsertHelper.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaBulkInsertHelper.java @@ -77,8 +77,11 @@ public HoodieWriteMetadata> bulkInsert(final List writeStatuses = bulkInsert(inputRecords, instantTime, table, config, performDedupe, userDefinedBulkInsertPartitioner, false, config.getBulkInsertShuffleParallelism(), new CreateHandleFactory(false)); + List writeStatuses = bulkInsert(inputRecords, instantTime, table, config, performDedupe, partitioner, false, + config.getBulkInsertShuffleParallelism(), new CreateHandleFactory(false)); //update index ((BaseJavaCommitActionExecutor) executor).updateIndexAndCommitIfNeeded(writeStatuses, result); return result; @@ -90,7 +93,7 @@ public List bulkInsert(List> inputRecords, HoodieTable>, List, List> table, HoodieWriteConfig config, boolean performDedupe, - Option userDefinedBulkInsertPartitioner, + BulkInsertPartitioner partitioner, boolean useWriterSchema, int parallelism, WriteHandleFactory writeHandleFactory) { @@ -103,12 +106,7 @@ public List bulkInsert(List> inputRecords, parallelism, table); } - final List> repartitionedRecords; - BulkInsertPartitioner partitioner = userDefinedBulkInsertPartitioner.isPresent() - ? userDefinedBulkInsertPartitioner.get() - : JavaBulkInsertInternalPartitionerFactory.get(config.getBulkInsertSortMode()); - // only List is supported for Java partitioner, but it is not enforced by BulkInsertPartitioner API. To improve this, TODO HUDI-3463 - repartitionedRecords = (List>) partitioner.repartitionRecords(dedupedRecords, parallelism); + final List> repartitionedRecords = (List>) partitioner.repartitionRecords(dedupedRecords, parallelism); FileIdPrefixProvider fileIdPrefixProvider = (FileIdPrefixProvider) ReflectionUtils.loadClass( config.getFileIdPrefixProviderClassName(), @@ -119,7 +117,8 @@ public List bulkInsert(List> inputRecords, new JavaLazyInsertIterable<>(repartitionedRecords.iterator(), true, config, instantTime, table, fileIdPrefixProvider.createFilePrefix(""), table.getTaskContextSupplier(), - new CreateHandleFactory<>()).forEachRemaining(writeStatuses::addAll); + // Always get the first WriteHandleFactory, as there is only a single data partition for hudi java engine. 
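The `JavaBulkInsertHelper` hunk that continues just below resolves the write handle factory and file id prefix through the two default methods added to `BulkInsertPartitioner` earlier in this patch. The sketch below restates that contract in a self-contained form, using JDK `Optional`, `UUID`, and plain strings as stand-ins for Hudi's `Option`, `FSUtils.createNewFileIdPfx()`, and `WriteHandleFactory`; the real interface also declares `repartitionRecords` and `arePartitionRecordsSorted`.

```
import java.util.Optional;
import java.util.UUID;

// Mirrors the two default methods added to BulkInsertPartitioner in this patch.
public class BulkInsertPartitionerDefaultsSketch {

  interface PartitionerSketch {
    default String getFileIdPfx(int partitionId) {
      // Default behaviour: a fresh file id prefix per output partition, so incoming
      // records land in a brand new file group.
      return UUID.randomUUID().toString();
    }

    default Optional<String> getWriteHandleFactory(int partitionId) {
      // Default behaviour: no opinion; the caller falls back to its own factory.
      return Optional.empty();
    }
  }

  public static void main(String[] args) {
    PartitionerSketch partitioner = new PartitionerSketch() {};
    int partitionId = 0;
    // Resolution pattern used by JavaBulkInsertHelper/BulkInsertMapFunction:
    // prefer the partitioner's factory, otherwise use the one passed in by the caller.
    String factory = partitioner.getWriteHandleFactory(partitionId).orElse("CreateHandleFactory");
    System.out.println("fileIdPfx=" + partitioner.getFileIdPfx(partitionId) + ", factory=" + factory);
  }
}
```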
+ (WriteHandleFactory) partitioner.getWriteHandleFactory(0).orElse(writeHandleFactory)).forEachRemaining(writeStatuses::addAll); return writeStatuses; } diff --git a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestJavaCopyOnWriteActionExecutor.java b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestJavaCopyOnWriteActionExecutor.java index 1bf1b4cccbf51..518414d614e8f 100644 --- a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestJavaCopyOnWriteActionExecutor.java +++ b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestJavaCopyOnWriteActionExecutor.java @@ -318,7 +318,7 @@ private void verifyStatusResult(List statuses, Map ex } @Test - public void testInsertRecords() throws Exception { + public void testInsertRecords() throws Exception { HoodieWriteConfig config = makeHoodieClientConfig(); String instantTime = makeNewCommitTime(); metaClient = HoodieTableMetaClient.reload(metaClient); @@ -465,6 +465,90 @@ public void testBulkInsertRecords(String bulkInsertMode) throws Exception { verifyStatusResult(returnedStatuses, generateExpectedPartitionNumRecords(inputRecords)); } + @Test + public void testDeleteRecords() throws Exception { + // Prepare the AvroParquetIO + HoodieWriteConfig config = makeHoodieClientConfig(); + int startInstant = 1; + String firstCommitTime = makeNewCommitTime(startInstant++, "%09d"); + HoodieJavaWriteClient writeClient = getHoodieWriteClient(config); + writeClient.startCommitWithTime(firstCommitTime); + metaClient = HoodieTableMetaClient.reload(metaClient); + BaseFileUtils fileUtils = BaseFileUtils.getInstance(metaClient); + + String partitionPath = "2022/04/09"; + + // Get some records belong to the same partition (2016/01/31) + String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + + "\"time\":\"2022-04-09T03:16:41.415Z\",\"number\":1}"; + String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + + "\"time\":\"2022-04-09T03:20:41.415Z\",\"number\":2}"; + String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + + "\"time\":\"2022-04-09T03:16:41.415Z\",\"number\":3}"; + + List records = new ArrayList<>(); + RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1)); + RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2)); + RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3)); + + // Insert new records + writeClient.insert(records, firstCommitTime); + + FileStatus[] allFiles = getIncrementalFiles(partitionPath, "0", -1); + assertEquals(1, allFiles.length); + + // Read out the bloom filter and make sure filter can answer record exist or not + Path filePath = allFiles[0].getPath(); + BloomFilter filter = fileUtils.readBloomFilterFromMetadata(hadoopConf, filePath); + for (HoodieRecord record : records) { + assertTrue(filter.mightContain(record.getRecordKey())); + } + + // Read the base file, check the record content + List fileRecords = fileUtils.readAvroRecords(hadoopConf, filePath); + int index = 0; + for (GenericRecord record : fileRecords) { + 
assertEquals(records.get(index).getRecordKey(), record.get("_row_key").toString()); + index++; + } + + String newCommitTime = makeNewCommitTime(startInstant++, "%09d"); + writeClient.startCommitWithTime(newCommitTime); + + // Test delete two records + List keysForDelete = new ArrayList(Arrays.asList(records.get(0).getKey(), records.get(2).getKey())); + writeClient.delete(keysForDelete, newCommitTime); + + allFiles = getIncrementalFiles(partitionPath, "0", -1); + assertEquals(1, allFiles.length); + + filePath = allFiles[0].getPath(); + // Read the base file, check the record content + fileRecords = fileUtils.readAvroRecords(hadoopConf, filePath); + // Check that the two records are deleted successfully + assertEquals(1, fileRecords.size()); + assertEquals(records.get(1).getRecordKey(), fileRecords.get(0).get("_row_key").toString()); + + newCommitTime = makeNewCommitTime(startInstant++, "%09d"); + writeClient.startCommitWithTime(newCommitTime); + + // Test delete last record + keysForDelete = new ArrayList(Arrays.asList(records.get(1).getKey())); + writeClient.delete(keysForDelete, newCommitTime); + + allFiles = getIncrementalFiles(partitionPath, "0", -1); + assertEquals(1, allFiles.length); + + filePath = allFiles[0].getPath(); + // Read the base file, check the record content + fileRecords = fileUtils.readAvroRecords(hadoopConf, filePath); + // Check whether all records have been deleted + assertEquals(0, fileRecords.size()); + } + public static Map generateExpectedPartitionNumRecords(List records) { return records.stream().map(record -> Pair.of(record.getPartitionPath(), 1)) .collect(Collectors.groupingBy(Pair::getLeft, Collectors.counting())); diff --git a/hudi-client/hudi-java-client/src/test/resources/log4j-surefire.properties b/hudi-client/hudi-java-client/src/test/resources/log4j-surefire.properties index 32af462093ae5..14bbb089724c8 100644 --- a/hudi-client/hudi-java-client/src/test/resources/log4j-surefire.properties +++ b/hudi-client/hudi-java-client/src/test/resources/log4j-surefire.properties @@ -20,9 +20,9 @@ log4j.logger.org.apache=INFO log4j.logger.org.apache.hudi=DEBUG log4j.logger.org.apache.hadoop.hbase=ERROR -# A1 is set to be a ConsoleAppender. +# CONSOLE is set to be a ConsoleAppender. log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. +# CONSOLE uses PatternLayout. 
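The new `testDeleteRecords` above exercises the Java write client's delete path, which accepts a list of `HoodieKey`s rather than full records. A small companion sketch of assembling that key list follows; the keys and partition are made up for illustration, and the `writeClient.delete(keysForDelete, newCommitTime)` call is shown only as a comment because it needs a live table, as in the test.

```
import org.apache.hudi.common.model.HoodieKey;

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class DeleteKeysSketch {

  public static void main(String[] args) {
    // A delete in Hudi is keyed: each entry names the record key and its partition path.
    List<HoodieKey> keysForDelete = Arrays.asList(
        new HoodieKey("8eb5b87a-0001", "2022/04/09"),
        new HoodieKey("8eb5b87c-0003", "2022/04/09"));

    // With a live table this list would be handed to the client after startCommitWithTime(), e.g.
    //   writeClient.delete(keysForDelete, newCommitTime);
    List<String> recordKeys = keysForDelete.stream()
        .map(HoodieKey::getRecordKey)
        .collect(Collectors.toList());
    System.out.println("Deleting " + recordKeys + " from partition 2022/04/09");
  }
}
```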
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/MultipleSparkJobExecutionStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/MultipleSparkJobExecutionStrategy.java index 5a03cdf3bc9a1..e09457f0e5135 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/MultipleSparkJobExecutionStrategy.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/MultipleSparkJobExecutionStrategy.java @@ -46,6 +46,7 @@ import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.exception.HoodieClusteringException; import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.execution.bulkinsert.BulkInsertInternalPartitionerFactory; import org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner; import org.apache.hudi.execution.bulkinsert.RDDSpatialCurveSortPartitioner; import org.apache.hudi.io.IOUtils; @@ -137,7 +138,7 @@ public abstract HoodieData performClusteringWithRecordsRDD(final Ho * @param schema Schema of the data including metadata fields. * @return {@link RDDCustomColumnsSortPartitioner} if sort columns are provided, otherwise empty. */ - protected Option>>> getPartitioner(Map strategyParams, Schema schema) { + protected BulkInsertPartitioner>> getPartitioner(Map strategyParams, Schema schema) { Option orderByColumnsOpt = Option.ofNullable(strategyParams.get(PLAN_STRATEGY_SORT_COLUMNS.key())) .map(listStr -> listStr.split(",")); @@ -159,7 +160,7 @@ protected Option>>> getPartitioner default: throw new UnsupportedOperationException(String.format("Layout optimization strategy '%s' is not supported", layoutOptStrategy)); } - }); + }).orElse(BulkInsertInternalPartitionerFactory.get(getWriteConfig().getBulkInsertSortMode())); } /** diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SparkSingleFileSortExecutionStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SparkSingleFileSortExecutionStrategy.java index 4a7ee7bceeacd..b61017c34ce41 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SparkSingleFileSortExecutionStrategy.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SparkSingleFileSortExecutionStrategy.java @@ -72,6 +72,7 @@ public HoodieData performClusteringWithRecordsRDD(HoodieData) SparkBulkInsertHelper.newInstance().bulkInsert(inputRecords, instantTime, getHoodieTable(), newConfig, false, getPartitioner(strategyParams, schema), true, numOutputGroups, new SingleFileHandleCreateFactory(fileGroupIdList.get(0).getFileId(), preserveHoodieMetadata)); } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/BulkInsertMapFunction.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/BulkInsertMapFunction.java index 24cdd70603cbe..66c3bdddcb1ef 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/BulkInsertMapFunction.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/BulkInsertMapFunction.java @@ 
-24,6 +24,7 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.execution.SparkLazyInsertIterable; import org.apache.hudi.io.WriteHandleFactory; +import org.apache.hudi.table.BulkInsertPartitioner; import org.apache.hudi.table.HoodieTable; import org.apache.spark.api.java.function.Function2; @@ -41,27 +42,27 @@ public class BulkInsertMapFunction private boolean areRecordsSorted; private HoodieWriteConfig config; private HoodieTable hoodieTable; - private List fileIDPrefixes; private boolean useWriterSchema; + private BulkInsertPartitioner partitioner; private WriteHandleFactory writeHandleFactory; public BulkInsertMapFunction(String instantTime, boolean areRecordsSorted, HoodieWriteConfig config, HoodieTable hoodieTable, - List fileIDPrefixes, boolean useWriterSchema, + boolean useWriterSchema, BulkInsertPartitioner partitioner, WriteHandleFactory writeHandleFactory) { this.instantTime = instantTime; this.areRecordsSorted = areRecordsSorted; this.config = config; this.hoodieTable = hoodieTable; - this.fileIDPrefixes = fileIDPrefixes; this.useWriterSchema = useWriterSchema; this.writeHandleFactory = writeHandleFactory; + this.partitioner = partitioner; } @Override public Iterator> call(Integer partition, Iterator> recordItr) { return new SparkLazyInsertIterable<>(recordItr, areRecordsSorted, config, instantTime, hoodieTable, - fileIDPrefixes.get(partition), hoodieTable.getTaskContextSupplier(), useWriterSchema, - writeHandleFactory); + partitioner.getFileIdPfx(partition), hoodieTable.getTaskContextSupplier(), useWriterSchema, + (WriteHandleFactory) partitioner.getWriteHandleFactory(partition).orElse(this.writeHandleFactory)); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveSortPartitioner.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveSortPartitioner.java index 219fb0b165972..50a0a534f881b 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveSortPartitioner.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveSortPartitioner.java @@ -49,9 +49,9 @@ public class RDDSpatialCurveSortPartitioner implements BulkInsertPartitioner>> { - private final HoodieSparkEngineContext sparkEngineContext; + private final transient HoodieSparkEngineContext sparkEngineContext; private final String[] orderByColumns; - private final Schema schema; + private final SerializableSchema schema; private final HoodieClusteringConfig.LayoutOptimizationStrategy layoutOptStrategy; private final HoodieClusteringConfig.SpatialCurveCompositionStrategyType curveCompositionStrategyType; @@ -64,14 +64,13 @@ public RDDSpatialCurveSortPartitioner(HoodieSparkEngineContext sparkEngineContex this.orderByColumns = orderByColumns; this.layoutOptStrategy = layoutOptStrategy; this.curveCompositionStrategyType = curveCompositionStrategyType; - this.schema = schema; + this.schema = new SerializableSchema(schema); } @Override public JavaRDD> repartitionRecords(JavaRDD> records, int outputSparkPartitions) { - SerializableSchema serializableSchema = new SerializableSchema(schema); JavaRDD genericRecordsRDD = - records.map(f -> (GenericRecord) f.getData().getInsertValue(serializableSchema.get()).get()); + records.map(f -> (GenericRecord) f.getData().getInsertValue(schema.get()).get()); Dataset sourceDataset = AvroConversionUtils.createDataFrame( @@ -82,7 +81,7 @@ public JavaRDD> 
repartitionRecords(JavaRDD> reco Dataset sortedDataset = reorder(sourceDataset, outputSparkPartitions); - return HoodieSparkUtils.createRdd(sortedDataset, schema.getName(), schema.getNamespace(), false, Option.empty()) + return HoodieSparkUtils.createRdd(sortedDataset, schema.get().getName(), schema.get().getNamespace(), false, Option.empty()) .toJavaRDD() .map(record -> { String key = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BaseSparkCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BaseSparkCommitActionExecutor.java index ade550897765a..205da82ac145d 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BaseSparkCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BaseSparkCommitActionExecutor.java @@ -112,10 +112,8 @@ public BaseSparkCommitActionExecutor(HoodieEngineContext context, } } - private HoodieData> clusteringHandleUpdate(HoodieData> inputRecords) { + private HoodieData> clusteringHandleUpdate(HoodieData> inputRecords, Set fileGroupsInPendingClustering) { context.setJobStatus(this.getClass().getSimpleName(), "Handling updates which are under clustering"); - Set fileGroupsInPendingClustering = - table.getFileSystemView().getFileGroupsInPendingClustering().map(Pair::getKey).collect(Collectors.toSet()); UpdateStrategy>> updateStrategy = (UpdateStrategy>>) ReflectionUtils .loadClass(config.getClusteringUpdatesStrategyClass(), this.context, fileGroupsInPendingClustering); Pair>, Set> recordsAndPendingClusteringFileGroups = @@ -166,7 +164,9 @@ public HoodieWriteMetadata> execute(HoodieData> inputRecordsWithClusteringUpdate = clusteringHandleUpdate(inputRecords); + Set fileGroupsInPendingClustering = + table.getFileSystemView().getFileGroupsInPendingClustering().map(Pair::getKey).collect(Collectors.toSet()); + HoodieData> inputRecordsWithClusteringUpdate = fileGroupsInPendingClustering.isEmpty() ? inputRecords : clusteringHandleUpdate(inputRecords, fileGroupsInPendingClustering); context.setJobStatus(this.getClass().getSimpleName(), "Doing partition and writing data"); HoodieData writeStatuses = mapPartitionsAsRDD(inputRecordsWithClusteringUpdate, partitioner); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java index 38e38101b0d02..1652c35eb63e6 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java @@ -20,7 +20,6 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.data.HoodieData; -import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; @@ -39,8 +38,6 @@ import org.apache.spark.api.java.JavaRDD; import java.util.List; -import java.util.stream.Collectors; -import java.util.stream.IntStream; /** * A spark implementation of {@link BaseBulkInsertHelper}. 
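The `RDDSpatialCurveSortPartitioner` hunk above addresses a Spark serialization hazard: the engine context becomes `transient` and the Avro `Schema`, which does not implement `java.io.Serializable`, is held as a `SerializableSchema` so the partitioner can be shipped to executors. A minimal sketch of the wrapping is below; the `SerializableSchema` constructor and `get()` mirror the calls in the hunk, while the import path is assumed to be the Hudi common module.

```
import org.apache.avro.Schema;
import org.apache.hudi.common.config.SerializableSchema;

public class SerializableSchemaSketch {

  public static void main(String[] args) {
    // org.apache.avro.Schema is not Serializable, so keeping it as a plain field would make
    // the enclosing partitioner fail Spark's task serialization.
    Schema schema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"rec\",\"fields\":[{\"name\":\"id\",\"type\":\"string\"}]}");

    SerializableSchema wrapped = new SerializableSchema(schema);

    // get() rebuilds the Avro Schema after deserialization on the executor side.
    System.out.println("wrapped schema name: " + wrapped.get().getName());
  }
}
```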
@@ -76,9 +73,12 @@ public HoodieWriteMetadata> bulkInsert(final HoodieData< table.getActiveTimeline().transitionRequestedToInflight(new HoodieInstant(HoodieInstant.State.REQUESTED, executor.getCommitActionType(), instantTime), Option.empty(), config.shouldAllowMultiWriteOnSameInstant()); + + BulkInsertPartitioner partitioner = userDefinedBulkInsertPartitioner.orElse(BulkInsertInternalPartitionerFactory.get(config.getBulkInsertSortMode())); + // write new files - HoodieData writeStatuses = - bulkInsert(inputRecords, instantTime, table, config, performDedupe, userDefinedBulkInsertPartitioner, false, config.getBulkInsertShuffleParallelism(), new CreateHandleFactory(false)); + HoodieData writeStatuses = bulkInsert(inputRecords, instantTime, table, config, performDedupe, partitioner, false, + config.getBulkInsertShuffleParallelism(), new CreateHandleFactory(false)); //update index ((BaseSparkCommitActionExecutor) executor).updateIndexAndCommitIfNeeded(writeStatuses, result); return result; @@ -90,7 +90,7 @@ public HoodieData bulkInsert(HoodieData> inputRecor HoodieTable>, HoodieData, HoodieData> table, HoodieWriteConfig config, boolean performDedupe, - Option userDefinedBulkInsertPartitioner, + BulkInsertPartitioner partitioner, boolean useWriterSchema, int parallelism, WriteHandleFactory writeHandleFactory) { @@ -103,20 +103,12 @@ public HoodieData bulkInsert(HoodieData> inputRecor parallelism, table); } - final HoodieData> repartitionedRecords; - BulkInsertPartitioner partitioner = userDefinedBulkInsertPartitioner.isPresent() - ? userDefinedBulkInsertPartitioner.get() - : BulkInsertInternalPartitionerFactory.get(config.getBulkInsertSortMode()); // only JavaRDD is supported for Spark partitioner, but it is not enforced by BulkInsertPartitioner API. To improve this, TODO HUDI-3463 - repartitionedRecords = HoodieJavaRDD.of((JavaRDD>) partitioner.repartitionRecords(HoodieJavaRDD.getJavaRDD(dedupedRecords), parallelism)); - - // generate new file ID prefixes for each output partition - final List fileIDPrefixes = - IntStream.range(0, parallelism).mapToObj(i -> FSUtils.createNewFileIdPfx()).collect(Collectors.toList()); + final HoodieData> repartitionedRecords = HoodieJavaRDD.of((JavaRDD>) partitioner.repartitionRecords(HoodieJavaRDD.getJavaRDD(dedupedRecords), parallelism)); JavaRDD writeStatusRDD = HoodieJavaRDD.getJavaRDD(repartitionedRecords) .mapPartitionsWithIndex(new BulkInsertMapFunction<>(instantTime, - partitioner.arePartitionRecordsSorted(), config, table, fileIDPrefixes, useWriterSchema, writeHandleFactory), true) + partitioner.arePartitionRecordsSorted(), config, table, useWriterSchema, partitioner, writeHandleFactory), true) .flatMap(List::iterator); return HoodieJavaRDD.of(writeStatusRDD); diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala index 57eb32fce3623..7a8f8a1580d97 100644 --- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala @@ -26,6 +26,9 @@ import org.apache.hudi.client.utils.SparkRowSerDe import org.apache.hudi.common.config.TypedProperties import org.apache.hudi.common.model.HoodieRecord import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.internal.schema.InternalSchema +import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter +import 
org.apache.hudi.internal.schema.utils.InternalSchemaUtils import org.apache.hudi.keygen.constant.KeyGeneratorOptions import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory import org.apache.hudi.keygen.{BaseKeyGenerator, CustomAvroKeyGenerator, CustomKeyGenerator, KeyGenerator} @@ -36,12 +39,8 @@ import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, Literal} import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{StringType, StructField, StructType} -import java.util.Properties - -import org.apache.hudi.internal.schema.InternalSchema -import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter -import org.apache.hudi.internal.schema.utils.InternalSchemaUtils +import java.util.Properties import scala.collection.JavaConverters._ object HoodieSparkUtils extends SparkAdapterSupport { @@ -54,13 +53,15 @@ object HoodieSparkUtils extends SparkAdapterSupport { def isSpark3_1: Boolean = SPARK_VERSION.startsWith("3.1") + def gteqSpark3_1: Boolean = SPARK_VERSION > "3.1" + + def gteqSpark3_1_3: Boolean = SPARK_VERSION >= "3.1.3" + def isSpark3_2: Boolean = SPARK_VERSION.startsWith("3.2") def gteqSpark3_2: Boolean = SPARK_VERSION > "3.2" - def gteqSpark3_1: Boolean = SPARK_VERSION > "3.1" - - def gteqSpark3_1_3: Boolean = SPARK_VERSION >= "3.1.3" + def gteqSpark3_2_1: Boolean = SPARK_VERSION >= "3.2.1" def getMetaSchema: StructType = { StructType(HoodieRecord.HOODIE_META_COLUMNS.asScala.map(col => { @@ -130,7 +131,15 @@ object HoodieSparkUtils extends SparkAdapterSupport { */ def createRdd(df: DataFrame, structName: String, recordNamespace: String, reconcileToLatestSchema: Boolean, latestTableSchema: org.apache.hudi.common.util.Option[Schema] = org.apache.hudi.common.util.Option.empty()): RDD[GenericRecord] = { - val latestTableSchemaConverted = if (latestTableSchema.isPresent && reconcileToLatestSchema) Some(latestTableSchema.get()) else None + var latestTableSchemaConverted : Option[Schema] = None + + if (latestTableSchema.isPresent && reconcileToLatestSchema) { + latestTableSchemaConverted = Some(latestTableSchema.get()) + } else { + // cases when users want to use latestTableSchema but have not turned on reconcileToLatestSchema explicitly + // for example, when using a Transformer implementation to transform source RDD to target RDD + latestTableSchemaConverted = if (latestTableSchema.isPresent) Some(latestTableSchema.get()) else None + } createRdd(df, structName, recordNamespace, latestTableSchemaConverted) } diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieCatalystExpressionUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieCatalystExpressionUtils.scala index fe30f61b92981..a3b9c210b9835 100644 --- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieCatalystExpressionUtils.scala +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieCatalystExpressionUtils.scala @@ -18,12 +18,30 @@ package org.apache.spark.sql import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedFunction} -import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, SubqueryExpression} +import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, SubqueryExpression, UnsafeProjection} import 
org.apache.spark.sql.catalyst.plans.logical.{Filter, LocalRelation, LogicalPlan} import org.apache.spark.sql.types.StructType trait HoodieCatalystExpressionUtils { + /** + * Generates instance of [[UnsafeProjection]] projecting row of one [[StructType]] into another [[StructType]] + * + * NOTE: No safety checks are executed to validate that this projection is actually feasible, + * it's up to the caller to make sure that such projection is possible. + * + * NOTE: Projection of the row from [[StructType]] A to [[StructType]] B is only possible, if + * B is a subset of A + */ + def generateUnsafeProjection(from: StructType, to: StructType): UnsafeProjection = { + val attrs = from.toAttributes + val attrsMap = attrs.map(attr => (attr.name, attr)).toMap + val targetExprs = to.fields.map(f => attrsMap(f.name)) + + GenerateUnsafeProjection.generate(targetExprs, attrs) + } + /** * Parses and resolves expression against the attributes of the given table schema. * diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/SparkAdapter.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/SparkAdapter.scala index d8ed173547851..a97743e62fac8 100644 --- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/SparkAdapter.scala +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/SparkAdapter.scala @@ -177,7 +177,7 @@ trait SparkAdapter extends Serializable { def createResolveHudiAlterTableCommand(sparkSession: SparkSession): Rule[LogicalPlan] /** - * Create hoodie parquet file format. + * Create instance of [[ParquetFileFormat]] */ - def createHoodieParquetFileFormat(): Option[ParquetFileFormat] + def createHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat] } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieMergeHandle.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieMergeHandle.java index 17ebccb153d2d..72749160e6bd0 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieMergeHandle.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieMergeHandle.java @@ -21,13 +21,15 @@ import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.config.HoodieCommonConfig; -import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.BaseFile; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.HoodieTestTable; import org.apache.hudi.common.util.collection.ExternalSpillableMap; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieIndexConfig; @@ -36,8 +38,6 @@ import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.testutils.HoodieClientTestHarness; import org.apache.hudi.testutils.HoodieClientTestUtils; - -import org.apache.hadoop.fs.FileSystem; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; @@ -51,6 +51,8 @@ import java.util.ArrayList; import java.util.List; import 
java.util.Properties; +import java.util.Set; +import java.util.stream.Collectors; import java.util.stream.Stream; import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors; @@ -94,7 +96,6 @@ public void testUpsertsForMultipleRecordsInSameFile(ExternalSpillableMap.DiskMap .withProperties(properties) .build(); try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) { - FileSystem fs = FSUtils.getFs(basePath, hadoopConf); /** * Write 1 (only inserts) This will do a bulk insert of 44 records of which there are 2 records repeated 21 times @@ -202,6 +203,7 @@ public void testUpsertsForMultipleRecordsInSameFile(ExternalSpillableMap.DiskMap // Check the entire dataset has 47 records still dataSet = getRecords(); assertEquals(47, dataSet.count(), "Must contain 47 records"); + Row[] rows = (Row[]) dataSet.collect(); int record1Count = 0; int record2Count = 0; @@ -228,6 +230,22 @@ public void testUpsertsForMultipleRecordsInSameFile(ExternalSpillableMap.DiskMap // Assert that id2 record count which has been updated to rider-004 and driver-004 is 21, which is the total // number of records with row_key id2 assertEquals(21, record2Count); + + // Validate that all the records only reference the _latest_ base files as part of the + // FILENAME_METADATA_FIELD payload (entailing that corresponding metadata is in-sync with + // the state of the table + HoodieTableFileSystemView tableView = + getHoodieTableFileSystemView(metaClient, metaClient.getActiveTimeline(), HoodieTestTable.of(metaClient).listAllBaseFiles()); + + Set latestBaseFileNames = tableView.getLatestBaseFiles() + .map(BaseFile::getFileName) + .collect(Collectors.toSet()); + + Set metadataFilenameFieldRefs = dataSet.collectAsList().stream() + .map(row -> row.getAs(HoodieRecord.FILENAME_METADATA_FIELD)) + .collect(Collectors.toSet()); + + assertEquals(latestBaseFileNames, metadataFilenameFieldRefs); } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiver.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiver.java index 445780384f97a..d412052c2dbbf 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiver.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiver.java @@ -889,12 +889,19 @@ public void testArchiveCompletedRollbackAndClean(boolean isEmpty, boolean enable metaClient = HoodieTableMetaClient.reload(metaClient); int startInstant = 1; + List expectedArchivedInstants = new ArrayList<>(); for (int i = 0; i < maxInstantsToKeep + 1; i++, startInstant++) { - createCleanMetadata(startInstant + "", false, isEmpty || i % 2 == 0); + createCleanMetadata(startInstant + "", false, false, isEmpty || i % 2 == 0); + expectedArchivedInstants.add(new HoodieInstant(State.REQUESTED, HoodieTimeline.CLEAN_ACTION, startInstant + "")); + expectedArchivedInstants.add(new HoodieInstant(State.INFLIGHT, HoodieTimeline.CLEAN_ACTION, startInstant + "")); + expectedArchivedInstants.add(new HoodieInstant(State.COMPLETED, HoodieTimeline.CLEAN_ACTION, startInstant + "")); } for (int i = 0; i < maxInstantsToKeep + 1; i++, startInstant += 2) { createCommitAndRollbackFile(startInstant + 1 + "", startInstant + "", false, isEmpty || i % 2 == 0); + expectedArchivedInstants.add(new HoodieInstant(State.REQUESTED, HoodieTimeline.ROLLBACK_ACTION, startInstant + "")); + expectedArchivedInstants.add(new HoodieInstant(State.INFLIGHT, HoodieTimeline.ROLLBACK_ACTION, startInstant + "")); + 
expectedArchivedInstants.add(new HoodieInstant(State.COMPLETED, HoodieTimeline.ROLLBACK_ACTION, startInstant + "")); } if (enableMetadataTable) { @@ -916,6 +923,14 @@ public void testArchiveCompletedRollbackAndClean(boolean isEmpty, boolean enable assertTrue(actionInstantMap.containsKey("rollback"), "Rollback Action key must be preset"); assertEquals(minInstantsToKeep, actionInstantMap.get("rollback").size(), "Should have min instant"); + + // verify all expected instants are part of archived timeline + metaClient.getArchivedTimeline().loadCompletedInstantDetailsInMemory(); + HoodieInstant firstInstant = metaClient.reloadActiveTimeline().firstInstant().get(); + expectedArchivedInstants = expectedArchivedInstants.stream() + .filter(entry -> HoodieTimeline.compareTimestamps(entry.getTimestamp(), HoodieTimeline.LESSER_THAN, firstInstant.getTimestamp() + )).collect(Collectors.toList()); + expectedArchivedInstants.forEach(entry -> assertTrue(metaClient.getArchivedTimeline().containsInstant(entry))); } @ParameterizedTest @@ -1271,7 +1286,8 @@ private void verifyArchival(List expectedArchivedInstants, List getArchivedInstants(HoodieInstant instant) { List instants = new ArrayList<>(); - if (instant.getAction() == HoodieTimeline.COMMIT_ACTION || instant.getAction() == HoodieTimeline.DELTA_COMMIT_ACTION || instant.getAction() == HoodieTimeline.CLEAN_ACTION) { + if (instant.getAction().equals(HoodieTimeline.COMMIT_ACTION) || instant.getAction().equals(HoodieTimeline.DELTA_COMMIT_ACTION) + || instant.getAction().equals(HoodieTimeline.CLEAN_ACTION) || instant.getAction().equals(HoodieTimeline.ROLLBACK_ACTION)) { instants.add(new HoodieInstant(State.REQUESTED, instant.getAction(), instant.getTimestamp())); } instants.add(new HoodieInstant(State.INFLIGHT, instant.getAction(), instant.getTimestamp())); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestCleaner.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestCleaner.java index 7e774c32c09f0..b8545b0f63809 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestCleaner.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestCleaner.java @@ -744,7 +744,7 @@ public void testCleanEmptyInstants() throws Exception { for (int i = 0; i < cleanCount; i++, startInstant++) { String commitTime = makeNewCommitTime(startInstant, "%09d"); - createCleanMetadata(commitTime + "", false, true); + createEmptyCleanMetadata(commitTime + "", false); } int instantClean = startInstant; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableRollback.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableRollback.java index 339e9e119ac09..043697f66b066 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableRollback.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableRollback.java @@ -500,6 +500,94 @@ void testMultiRollbackWithDeltaAndCompactionCommit() throws Exception { } } + @Test + void testRestoreWithCleanedUpCommits() throws Exception { + boolean populateMetaFields = true; + HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(false) + // Timeline-server-based markers are not used for multi-rollback tests + .withMarkersType(MarkerType.DIRECT.name()); + addConfigsForPopulateMetaFields(cfgBuilder, 
populateMetaFields); + HoodieWriteConfig cfg = cfgBuilder.build(); + + Properties properties = populateMetaFields ? new Properties() : getPropertiesForKeyGen(); + properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().toString()); + HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties); + + try (final SparkRDDWriteClient client = getHoodieWriteClient(cfg)) { + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + + /* + * Write 1 (only inserts) + */ + String newCommitTime = "001"; + client.startCommitWithTime(newCommitTime); + List records = dataGen.generateInserts(newCommitTime, 200); + JavaRDD writeRecords = jsc().parallelize(records, 1); + JavaRDD writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime); + List statuses = writeStatusJavaRDD.collect(); + assertNoWriteErrors(statuses); + client.commit(newCommitTime, jsc().parallelize(statuses)); + + upsertRecords(client, "002", records, dataGen); + + client.savepoint("002","user1","comment1"); + + upsertRecords(client, "003", records, dataGen); + upsertRecords(client, "004", records, dataGen); + + // Compaction commit + String compactionInstantTime = "006"; + client.scheduleCompactionAtInstant(compactionInstantTime, Option.empty()); + HoodieWriteMetadata> compactionMetadata = client.compact(compactionInstantTime); + client.commitCompaction(compactionInstantTime, compactionMetadata.getCommitMetadata().get(), Option.empty()); + + upsertRecords(client, "007", records, dataGen); + upsertRecords(client, "008", records, dataGen); + + // Compaction commit + String compactionInstantTime1 = "009"; + client.scheduleCompactionAtInstant(compactionInstantTime1, Option.empty()); + HoodieWriteMetadata> compactionMetadata1 = client.compact(compactionInstantTime1); + client.commitCompaction(compactionInstantTime1, compactionMetadata1.getCommitMetadata().get(), Option.empty()); + + upsertRecords(client, "010", records, dataGen); + + // trigger clean. creating a new client with aggresive cleaner configs so that clean will kick in immediately. + cfgBuilder = getConfigBuilder(false) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).build()) + // Timeline-server-based markers are not used for multi-rollback tests + .withMarkersType(MarkerType.DIRECT.name()); + addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); + HoodieWriteConfig cfg1 = cfgBuilder.build(); + final SparkRDDWriteClient client1 = getHoodieWriteClient(cfg1); + client1.clean(); + client1.close(); + + metaClient = HoodieTableMetaClient.reload(metaClient); + upsertRecords(client, "011", records, dataGen); + + // Rollback to 002 + client.restoreToInstant("002", cfg.isMetadataTableEnabled()); + + // verify that no files are present after 002. 
every data file should have been cleaned up + HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient); + FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable); + HoodieTableFileSystemView tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); + Stream dataFilesToRead = tableView.getLatestBaseFiles(); + assertFalse(dataFilesToRead.anyMatch(file -> HoodieTimeline.compareTimestamps("002", HoodieTimeline.GREATER_THAN, file.getCommitTime()))); + } + } + + private void upsertRecords(SparkRDDWriteClient client, String commitTime, List records, HoodieTestDataGenerator dataGen) throws IOException { + client.startCommitWithTime(commitTime); + List copyOfRecords = new ArrayList<>(records); + copyOfRecords = dataGen.generateUpdates(commitTime, copyOfRecords); + List statuses = client.upsert(jsc().parallelize(copyOfRecords, 1), commitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + client.commit(commitTime, jsc().parallelize(statuses)); + } + private long getTotalRecordsWritten(HoodieCommitMetadata commitMetadata) { return commitMetadata.getPartitionToWriteStats().values().stream() .flatMap(Collection::stream) diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestHarness.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestHarness.java index 1b41769ecc6da..4504c552c95d6 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestHarness.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestHarness.java @@ -696,10 +696,14 @@ private void runFullValidation(HoodieMetadataConfig metadataConfig, } public HoodieInstant createCleanMetadata(String instantTime, boolean inflightOnly) throws IOException { - return createCleanMetadata(instantTime, inflightOnly, false); + return createCleanMetadata(instantTime, inflightOnly, false, false); } - public HoodieInstant createCleanMetadata(String instantTime, boolean inflightOnly, boolean isEmpty) throws IOException { + public HoodieInstant createEmptyCleanMetadata(String instantTime, boolean inflightOnly) throws IOException { + return createCleanMetadata(instantTime, inflightOnly, true, true); + } + + public HoodieInstant createCleanMetadata(String instantTime, boolean inflightOnly, boolean isEmptyForAll, boolean isEmptyCompleted) throws IOException { HoodieCleanerPlan cleanerPlan = new HoodieCleanerPlan(new HoodieActionInstant("", "", ""), "", new HashMap<>(), CleanPlanV2MigrationHandler.VERSION, new HashMap<>(), new ArrayList<>()); if (inflightOnly) { @@ -713,7 +717,7 @@ HoodieTestUtils.DEFAULT_PARTITION_PATHS[new Random().nextInt(HoodieTestUtils.DEF Collections.emptyList(), instantTime); HoodieCleanMetadata cleanMetadata = convertCleanMetadata(instantTime, Option.of(0L), Collections.singletonList(cleanStats)); - HoodieTestTable.of(metaClient).addClean(instantTime, cleanerPlan, cleanMetadata, isEmpty); + HoodieTestTable.of(metaClient).addClean(instantTime, cleanerPlan, cleanMetadata, isEmptyForAll, isEmptyCompleted); } return new HoodieInstant(inflightOnly, "clean", instantTime); } diff --git a/hudi-client/hudi-spark-client/src/test/resources/log4j-surefire.properties b/hudi-client/hudi-spark-client/src/test/resources/log4j-surefire.properties index 32af462093ae5..14bbb089724c8 100644 --- a/hudi-client/hudi-spark-client/src/test/resources/log4j-surefire.properties +++ 
b/hudi-client/hudi-spark-client/src/test/resources/log4j-surefire.properties @@ -20,9 +20,9 @@ log4j.logger.org.apache=INFO log4j.logger.org.apache.hudi=DEBUG log4j.logger.org.apache.hadoop.hbase=ERROR -# A1 is set to be a ConsoleAppender. +# CONSOLE is set to be a ConsoleAppender. log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. +# CONSOLE uses PatternLayout. log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter diff --git a/hudi-common/src/main/avro/HoodieMetadata.avsc b/hudi-common/src/main/avro/HoodieMetadata.avsc index a8d7ca72bd143..a47cbf3784f52 100644 --- a/hudi-common/src/main/avro/HoodieMetadata.avsc +++ b/hudi-common/src/main/avro/HoodieMetadata.avsc @@ -107,7 +107,8 @@ "type": [ "null", "string" - ] + ], + "default" : null }, { "doc": "Column name for which this column statistics applies", @@ -318,7 +319,8 @@ "type": [ "null", "long" - ] + ], + "default": null }, { "doc": "Total count of null values", @@ -326,7 +328,8 @@ "type": [ "null", "long" - ] + ], + "default": null }, { "doc": "Total storage size on disk", @@ -334,7 +337,8 @@ "type": [ "null", "long" - ] + ], + "default": null }, { "doc": "Total uncompressed storage size on disk", @@ -342,7 +346,8 @@ "type": [ "null", "long" - ] + ], + "default": null }, { "doc": "Column range entry valid/deleted flag", diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/AvroSchemaUtils.java b/hudi-common/src/main/java/org/apache/hudi/avro/AvroSchemaUtils.java new file mode 100644 index 0000000000000..dd14dca671b39 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/avro/AvroSchemaUtils.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.avro; + +import org.apache.avro.AvroRuntimeException; +import org.apache.avro.Schema; + +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.util.ValidationUtils.checkState; + +public class AvroSchemaUtils { + + private AvroSchemaUtils() {} + + /** + * Appends provided new fields at the end of the given schema + * + * NOTE: No deduplication is made, this method simply appends fields at the end of the list + * of the source schema as is + */ + public static Schema appendFieldsToSchema(Schema schema, List newFields) { + List fields = schema.getFields().stream() + .map(field -> new Schema.Field(field.name(), field.schema(), field.doc(), field.defaultVal())) + .collect(Collectors.toList()); + fields.addAll(newFields); + + Schema newSchema = Schema.createRecord(schema.getName(), schema.getDoc(), schema.getNamespace(), schema.isError()); + newSchema.setFields(fields); + return newSchema; + } + + /** + * Passed in {@code Union} schema and will try to resolve the field with the {@code fieldSchemaFullName} + * w/in the union returning its corresponding schema + * + * @param schema target schema to be inspected + * @param fieldSchemaFullName target field-name to be looked up w/in the union + * @return schema of the field w/in the union identified by the {@code fieldSchemaFullName} + */ + public static Schema resolveUnionSchema(Schema schema, String fieldSchemaFullName) { + if (schema.getType() != Schema.Type.UNION) { + return schema; + } + + List innerTypes = schema.getTypes(); + Schema nonNullType = + innerTypes.stream() + .filter(it -> it.getType() != Schema.Type.NULL && Objects.equals(it.getFullName(), fieldSchemaFullName)) + .findFirst() + .orElse(null); + + if (nonNullType == null) { + throw new AvroRuntimeException( + String.format("Unsupported Avro UNION type %s: Only UNION of a null type and a non-null type is supported", schema)); + } + + return nonNullType; + } + + /** + * Resolves typical Avro's nullable schema definition: {@code Union(Schema.Type.NULL, )}, + * decomposing union and returning the target non-null type + */ + public static Schema resolveNullableSchema(Schema schema) { + if (schema.getType() != Schema.Type.UNION) { + return schema; + } + + List innerTypes = schema.getTypes(); + Schema nonNullType = + innerTypes.stream() + .filter(it -> it.getType() != Schema.Type.NULL) + .findFirst() + .orElse(null); + + if (innerTypes.size() != 2 || nonNullType == null) { + throw new AvroRuntimeException( + String.format("Unsupported Avro UNION type %s: Only UNION of a null type and a non-null type is supported", schema)); + } + + return nonNullType; + } + + /** + * Creates schema following Avro's typical nullable schema definition: {@code Union(Schema.Type.NULL, )}, + * wrapping around provided target non-null type + */ + public static Schema createNullableSchema(Schema.Type avroType) { + checkState(avroType != Schema.Type.NULL); + return Schema.createUnion(Schema.create(Schema.Type.NULL), Schema.create(avroType)); + } + +} diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java index 1055bd522022c..f69d5683d1cfb 100644 --- a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java @@ -18,8 +18,18 @@ package org.apache.hudi.avro; +import org.apache.hudi.common.config.SerializableSchema; +import 
org.apache.hudi.common.model.HoodieOperation; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.SchemaCompatibilityException; + import org.apache.avro.AvroRuntimeException; -import org.apache.avro.SchemaCompatibility; import org.apache.avro.Conversions; import org.apache.avro.Conversions.DecimalConversion; import org.apache.avro.JsonProperties; @@ -27,6 +37,7 @@ import org.apache.avro.LogicalTypes.Decimal; import org.apache.avro.Schema; import org.apache.avro.Schema.Field; +import org.apache.avro.SchemaCompatibility; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericData.Record; import org.apache.avro.generic.GenericDatumReader; @@ -42,16 +53,6 @@ import org.apache.avro.io.JsonDecoder; import org.apache.avro.io.JsonEncoder; import org.apache.avro.specific.SpecificRecordBase; -import org.apache.hudi.common.config.SerializableSchema; -import org.apache.hudi.common.model.HoodieOperation; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.StringUtils; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.exception.SchemaCompatibilityException; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; @@ -64,19 +65,21 @@ import java.sql.Timestamp; import java.time.LocalDate; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collection; import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; import java.util.List; import java.util.Map; -import java.util.Objects; -import java.util.HashMap; +import java.util.Deque; +import java.util.LinkedList; import java.util.TimeZone; -import java.util.Iterator; - import java.util.stream.Collectors; import static org.apache.avro.Schema.Type.UNION; +import static org.apache.hudi.avro.AvroSchemaUtils.createNullableSchema; +import static org.apache.hudi.avro.AvroSchemaUtils.resolveNullableSchema; +import static org.apache.hudi.avro.AvroSchemaUtils.resolveUnionSchema; /** * Helper class to do common stuff across Avro. @@ -97,8 +100,7 @@ public class HoodieAvroUtils { private static final String MASK_FOR_INVALID_CHARS_IN_NAMES = "__"; // All metadata fields are optional strings. - public static final Schema METADATA_FIELD_SCHEMA = - Schema.createUnion(Arrays.asList(Schema.create(Schema.Type.NULL), Schema.create(Schema.Type.STRING))); + public static final Schema METADATA_FIELD_SCHEMA = createNullableSchema(Schema.Type.STRING); public static final Schema RECORD_KEY_SCHEMA = initRecordKeySchema(); @@ -327,31 +329,6 @@ public static GenericRecord addOperationToRecord(GenericRecord record, HoodieOpe return record; } - /** - * Add null fields to passed in schema. Caller is responsible for ensuring there is no duplicates. As different query - * engines have varying constraints regarding treating the case-sensitivity of fields, its best to let caller - * determine that. 
- * - * @param schema Passed in schema - * @param newFieldNames Null Field names to be added - */ - public static Schema appendNullSchemaFields(Schema schema, List newFieldNames) { - List newFields = new ArrayList<>(); - for (String newField : newFieldNames) { - newFields.add(new Schema.Field(newField, METADATA_FIELD_SCHEMA, "", JsonProperties.NULL_VALUE)); - } - return createNewSchemaWithExtraFields(schema, newFields); - } - - public static Schema createNewSchemaWithExtraFields(Schema schema, List newFields) { - List fields = schema.getFields().stream() - .map(field -> new Field(field.name(), field.schema(), field.doc(), field.defaultVal())).collect(Collectors.toList()); - fields.addAll(newFields); - Schema newSchema = Schema.createRecord(schema.getName(), schema.getDoc(), schema.getNamespace(), schema.isError()); - newSchema.setFields(fields); - return newSchema; - } - /** * Adds the Hoodie commit metadata into the provided Generic Record. */ @@ -431,6 +408,14 @@ public static GenericRecord rewriteRecordWithMetadata(GenericRecord genericRecor return newRecord; } + // TODO Unify the logical of rewriteRecordWithMetadata and rewriteEvolutionRecordWithMetadata, and delete this function. + public static GenericRecord rewriteEvolutionRecordWithMetadata(GenericRecord genericRecord, Schema newSchema, String fileName) { + GenericRecord newRecord = HoodieAvroUtils.rewriteRecordWithNewSchema(genericRecord, newSchema, new HashMap<>()); + // do not preserve FILENAME_METADATA_FIELD + newRecord.put(HoodieRecord.FILENAME_METADATA_FIELD_POS, fileName); + return newRecord; + } + /** * Converts list of {@link GenericRecord} provided into the {@link GenericRecord} adhering to the * provided {@code newSchema}. @@ -441,6 +426,17 @@ public static List rewriteRecords(List records, Sc return records.stream().map(r -> rewriteRecord(r, newSchema)).collect(Collectors.toList()); } + /** + * Given an Avro record and list of columns to remove, this method removes the list of columns from + * the given avro record using rewriteRecord method. + *


+ * To better understand how it removes please check {@link #rewriteRecord(GenericRecord, Schema)} + */ + public static GenericRecord removeFields(GenericRecord record, List fieldsToRemove) { + Schema newSchema = removeFields(record.getSchema(), fieldsToRemove); + return rewriteRecord(record, newSchema); + } + private static void copyOldValueOrSetDefault(GenericRecord oldRecord, GenericRecord newRecord, Schema.Field field) { Schema oldSchema = oldRecord.getSchema(); Object fieldValue = oldSchema.getField(field.name()) == null ? null : oldRecord.get(field.name()); @@ -485,6 +481,17 @@ public static Schema generateProjectionSchema(Schema originalSchema, List innerTypes = schema.getTypes(); - Schema nonNullType = - innerTypes.stream() - .filter(it -> it.getType() != Schema.Type.NULL && Objects.equals(it.getFullName(), fieldSchemaFullName)) - .findFirst() - .orElse(null); - - if (nonNullType == null) { - throw new AvroRuntimeException( - String.format("Unsupported Avro UNION type %s: Only UNION of a null type and a non-null type is supported", schema)); - } - - return nonNullType; - } - - public static Schema resolveNullableSchema(Schema schema) { - if (schema.getType() != Schema.Type.UNION) { - return schema; - } - - List innerTypes = schema.getTypes(); - Schema nonNullType = - innerTypes.stream() - .filter(it -> it.getType() != Schema.Type.NULL) - .findFirst() - .orElse(null); - - if (innerTypes.size() != 2 || nonNullType == null) { - throw new AvroRuntimeException( - String.format("Unsupported Avro UNION type %s: Only UNION of a null type and a non-null type is supported", schema)); - } - - return nonNullType; - } - /** * Given a avro record with a given schema, rewrites it into the new schema while setting fields only from the new schema. * support deep rewrite for nested record. @@ -774,14 +741,28 @@ public static Schema resolveNullableSchema(Schema schema) { * * @param oldRecord oldRecord to be rewritten * @param newSchema newSchema used to rewrite oldRecord + * @param renameCols a map store all rename cols, (k, v)-> (colNameFromNewSchema, colNameFromOldSchema) * @return newRecord for new Schema */ - public static GenericRecord rewriteRecordWithNewSchema(IndexedRecord oldRecord, Schema newSchema) { - Object newRecord = rewriteRecordWithNewSchema(oldRecord, oldRecord.getSchema(), newSchema); + public static GenericRecord rewriteRecordWithNewSchema(IndexedRecord oldRecord, Schema newSchema, Map renameCols) { + Object newRecord = rewriteRecordWithNewSchema(oldRecord, oldRecord.getSchema(), newSchema, renameCols, new LinkedList<>()); return (GenericData.Record) newRecord; } - private static Object rewriteRecordWithNewSchema(Object oldRecord, Schema oldSchema, Schema newSchema) { + /** + * Given a avro record with a given schema, rewrites it into the new schema while setting fields only from the new schema. + * support deep rewrite for nested record and adjust rename operation. + * This particular method does the following things : + * a) Create a new empty GenericRecord with the new schema. + * b) For GenericRecord, copy over the data from the old schema to the new schema or set default values for all fields of this transformed schema + * + * @param oldRecord oldRecord to be rewritten + * @param newSchema newSchema used to rewrite oldRecord + * @param renameCols a map store all rename cols, (k, v)-> (colNameFromNewSchema, colNameFromOldSchema) + * @param fieldNames track the full name of visited field when we travel new schema. 
+ * @return newRecord for new Schema + */ + private static Object rewriteRecordWithNewSchema(Object oldRecord, Schema oldSchema, Schema newSchema, Map renameCols, Deque fieldNames) { if (oldRecord == null) { return null; } @@ -796,10 +777,23 @@ private static Object rewriteRecordWithNewSchema(Object oldRecord, Schema oldSch for (int i = 0; i < fields.size(); i++) { Schema.Field field = fields.get(i); + String fieldName = field.name(); + fieldNames.push(fieldName); if (oldSchema.getField(field.name()) != null) { Schema.Field oldField = oldSchema.getField(field.name()); - helper.put(i, rewriteRecordWithNewSchema(indexedRecord.get(oldField.pos()), oldField.schema(), fields.get(i).schema())); + helper.put(i, rewriteRecordWithNewSchema(indexedRecord.get(oldField.pos()), oldField.schema(), fields.get(i).schema(), renameCols, fieldNames)); + } else { + String fieldFullName = createFullName(fieldNames); + String[] colNamePartsFromOldSchema = renameCols.getOrDefault(fieldFullName, "").split("\\."); + String lastColNameFromOldSchema = colNamePartsFromOldSchema[colNamePartsFromOldSchema.length - 1]; + // deal with rename + if (oldSchema.getField(field.name()) == null && oldSchema.getField(lastColNameFromOldSchema) != null) { + // find rename + Schema.Field oldField = oldSchema.getField(lastColNameFromOldSchema); + helper.put(i, rewriteRecordWithNewSchema(indexedRecord.get(oldField.pos()), oldField.schema(), fields.get(i).schema(), renameCols, fieldNames)); + } } + fieldNames.pop(); } GenericData.Record newRecord = new GenericData.Record(newSchema); for (int i = 0; i < fields.size(); i++) { @@ -820,9 +814,11 @@ private static Object rewriteRecordWithNewSchema(Object oldRecord, Schema oldSch } Collection array = (Collection)oldRecord; List newArray = new ArrayList(); + fieldNames.push("element"); for (Object element : array) { - newArray.add(rewriteRecordWithNewSchema(element, oldSchema.getElementType(), newSchema.getElementType())); + newArray.add(rewriteRecordWithNewSchema(element, oldSchema.getElementType(), newSchema.getElementType(), renameCols, fieldNames)); } + fieldNames.pop(); return newArray; case MAP: if (!(oldRecord instanceof Map)) { @@ -830,17 +826,29 @@ private static Object rewriteRecordWithNewSchema(Object oldRecord, Schema oldSch } Map map = (Map) oldRecord; Map newMap = new HashMap<>(); + fieldNames.push("value"); for (Map.Entry entry : map.entrySet()) { - newMap.put(entry.getKey(), rewriteRecordWithNewSchema(entry.getValue(), oldSchema.getValueType(), newSchema.getValueType())); + newMap.put(entry.getKey(), rewriteRecordWithNewSchema(entry.getValue(), oldSchema.getValueType(), newSchema.getValueType(), renameCols, fieldNames)); } + fieldNames.pop(); return newMap; case UNION: - return rewriteRecordWithNewSchema(oldRecord, getActualSchemaFromUnion(oldSchema, oldRecord), getActualSchemaFromUnion(newSchema, oldRecord)); + return rewriteRecordWithNewSchema(oldRecord, getActualSchemaFromUnion(oldSchema, oldRecord), getActualSchemaFromUnion(newSchema, oldRecord), renameCols, fieldNames); default: return rewritePrimaryType(oldRecord, oldSchema, newSchema); } } + private static String createFullName(Deque fieldNames) { + String result = ""; + if (!fieldNames.isEmpty()) { + List parentNames = new ArrayList<>(); + fieldNames.descendingIterator().forEachRemaining(parentNames::add); + result = parentNames.stream().collect(Collectors.joining(".")); + } + return result; + } + private static Object rewritePrimaryType(Object oldValue, Schema oldSchema, Schema newSchema) { Schema realOldSchema = 
oldSchema; if (realOldSchema.getType() == UNION) { @@ -1013,9 +1021,10 @@ private static Schema getActualSchemaFromUnion(Schema schema, Object data) { * * @param oldRecords oldRecords to be rewrite * @param newSchema newSchema used to rewrite oldRecord + * @param renameCols a map store all rename cols, (k, v)-> (colNameFromNewSchema, colNameFromOldSchema) * @return a iterator of rewrote GeneriRcords */ - public static Iterator rewriteRecordWithNewSchema(Iterator oldRecords, Schema newSchema) { + public static Iterator rewriteRecordWithNewSchema(Iterator oldRecords, Schema newSchema, Map renameCols) { if (oldRecords == null || newSchema == null) { return Collections.emptyIterator(); } @@ -1027,7 +1036,7 @@ public boolean hasNext() { @Override public GenericRecord next() { - return rewriteRecordWithNewSchema(oldRecords.next(), newSchema); + return rewriteRecordWithNewSchema(oldRecords.next(), newSchema, renameCols); } }; } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/DFSPropertiesConfiguration.java b/hudi-common/src/main/java/org/apache/hudi/common/config/DFSPropertiesConfiguration.java index afce8f6ec4546..e4176499487ee 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/config/DFSPropertiesConfiguration.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/config/DFSPropertiesConfiguration.java @@ -99,7 +99,11 @@ public static TypedProperties loadGlobalProps() { if (defaultConfPath.isPresent()) { conf.addPropsFromFile(defaultConfPath.get()); } else { - conf.addPropsFromFile(DEFAULT_PATH); + try { + conf.addPropsFromFile(DEFAULT_PATH); + } catch (Exception e) { + LOG.warn("Cannot load default config file: " + DEFAULT_PATH, e); + } } return conf.getProps(); } @@ -126,13 +130,17 @@ public void addPropsFromFile(Path filePath) { filePath.toString(), Option.ofNullable(hadoopConfig).orElseGet(Configuration::new) ); + try { if (filePath.equals(DEFAULT_PATH) && !fs.exists(filePath)) { LOG.warn("Properties file " + filePath + " not found. 
Ignoring to load props file"); return; } + } catch (IOException ioe) { + throw new HoodieIOException("Cannot check if the properties file exist: " + filePath, ioe); + } - BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(filePath))); + try (BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(filePath)))) { visitedFilePaths.add(filePath.toString()); currentFilePath = filePath; addPropsFromStream(reader); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java index 1bde88d3bb647..79badb48a5895 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java @@ -96,22 +96,26 @@ public static Configuration prepareHadoopConf(Configuration conf) { return conf; } - public static FileSystem getFs(String path, Configuration conf) { + public static FileSystem getFs(String pathStr, Configuration conf) { + return getFs(new Path(pathStr), conf); + } + + public static FileSystem getFs(Path path, Configuration conf) { FileSystem fs; prepareHadoopConf(conf); try { - fs = new Path(path).getFileSystem(conf); + fs = path.getFileSystem(conf); } catch (IOException e) { throw new HoodieIOException("Failed to get instance of " + FileSystem.class.getName(), e); } return fs; } - public static FileSystem getFs(String path, Configuration conf, boolean localByDefault) { + public static FileSystem getFs(String pathStr, Configuration conf, boolean localByDefault) { if (localByDefault) { - return getFs(addSchemeIfLocalPath(path).toString(), conf); + return getFs(addSchemeIfLocalPath(pathStr), conf); } - return getFs(path, conf); + return getFs(pathStr, conf); } /** @@ -178,7 +182,7 @@ public static String getCommitFromCommitFile(String commitFileName) { } public static String getCommitTime(String fullFileName) { - if (isLogFile(new Path(fullFileName))) { + if (isLogFile(fullFileName)) { return fullFileName.split("_")[1].split("\\.")[0]; } return fullFileName.split("_")[2].split("\\.")[0]; @@ -461,8 +465,12 @@ public static boolean isBaseFile(Path path) { } public static boolean isLogFile(Path logPath) { - Matcher matcher = LOG_FILE_PATTERN.matcher(logPath.getName()); - return matcher.find() && logPath.getName().contains(".log"); + return isLogFile(logPath.getName()); + } + + public static boolean isLogFile(String fileName) { + Matcher matcher = LOG_FILE_PATTERN.matcher(fileName); + return matcher.find() && fileName.contains(".log"); } /** diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/HoodieWrapperFileSystem.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/HoodieWrapperFileSystem.java index 4bbd94384420d..a79d1571afe73 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/fs/HoodieWrapperFileSystem.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/HoodieWrapperFileSystem.java @@ -48,7 +48,7 @@ import org.apache.hadoop.security.Credentials; import org.apache.hadoop.security.token.Token; import org.apache.hadoop.util.Progressable; -import org.apache.hudi.hadoop.FileNameCachingPath; +import org.apache.hudi.hadoop.CachingPath; import java.io.IOException; import java.net.URI; @@ -142,7 +142,7 @@ public static Path convertPathWithScheme(Path oldPath, String newScheme) { try { newURI = new URI(newScheme, oldURI.getUserInfo(), oldURI.getHost(), oldURI.getPort(), oldURI.getPath(), oldURI.getQuery(), oldURI.getFragment()); - return new FileNameCachingPath(newURI); + 
return new CachingPath(newURI); } catch (URISyntaxException e) { // TODO - Better Exception handling throw new RuntimeException(e); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/BaseFile.java b/hudi-common/src/main/java/org/apache/hudi/common/model/BaseFile.java index f12c207ee75b6..cd35861b7499e 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/BaseFile.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/BaseFile.java @@ -31,26 +31,35 @@ public class BaseFile implements Serializable { private static final long serialVersionUID = 1L; + private transient FileStatus fileStatus; private final String fullPath; + private final String fileName; private long fileLen; public BaseFile(BaseFile dataFile) { - this.fileStatus = dataFile.fileStatus; - this.fullPath = dataFile.fullPath; - this.fileLen = dataFile.fileLen; + this(dataFile.fileStatus, + dataFile.fullPath, + dataFile.getFileName(), + dataFile.getFileLen()); } public BaseFile(FileStatus fileStatus) { - this.fileStatus = fileStatus; - this.fullPath = fileStatus.getPath().toString(); - this.fileLen = fileStatus.getLen(); + this(fileStatus, + fileStatus.getPath().toString(), + fileStatus.getPath().getName(), + fileStatus.getLen()); } public BaseFile(String filePath) { - this.fileStatus = null; - this.fullPath = filePath; - this.fileLen = -1; + this(null, filePath, getFileName(filePath), -1); + } + + private BaseFile(FileStatus fileStatus, String fullPath, String fileName, long fileLen) { + this.fileStatus = fileStatus; + this.fullPath = fullPath; + this.fileLen = fileLen; + this.fileName = fileName; } public String getPath() { @@ -58,7 +67,7 @@ public String getPath() { } public String getFileName() { - return new Path(fullPath).getName(); + return fileName; } public FileStatus getFileStatus() { @@ -98,4 +107,8 @@ public int hashCode() { public String toString() { return "BaseFile{fullPath=" + fullPath + ", fileLen=" + fileLen + '}'; } + + private static String getFileName(String fullPath) { + return new Path(fullPath).getName(); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodiePartitionMetadata.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodiePartitionMetadata.java index 93e9ea5d3433a..89bad1c33f599 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodiePartitionMetadata.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodiePartitionMetadata.java @@ -213,7 +213,7 @@ private boolean readTextFormatMetaFile() { format = Option.empty(); return true; } catch (Throwable t) { - LOG.warn("Unable to read partition meta properties file for partition " + partitionPath, t); + LOG.debug("Unable to read partition meta properties file for partition " + partitionPath); return false; } } @@ -229,8 +229,7 @@ private boolean readBaseFormatMetaFile() { format = Option.of(reader.getFormat()); return true; } catch (Throwable t) { - // any error, log, check the next base format - LOG.warn("Unable to read partition metadata " + metafilePath.getName() + " for partition " + partitionPath, t); + LOG.debug("Unable to read partition metadata " + metafilePath.getName() + " for partition " + partitionPath); } } return false; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java index 254044bd28371..edc6caa5bcbdf 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java +++ 
b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java @@ -193,7 +193,7 @@ public class HoodieTableConfig extends HoodieConfig { public static final ConfigProperty PARTITION_METAFILE_USE_BASE_FORMAT = ConfigProperty .key("hoodie.partition.metafile.use.base.format") .defaultValue(false) - .withDocumentation("If true, partition metafiles are saved in the same format as basefiles for this dataset (e.g. Parquet / ORC). " + .withDocumentation("If true, partition metafiles are saved in the same format as base-files for this dataset (e.g. Parquet / ORC). " + "If false (default) partition metafiles are saved as properties files."); public static final ConfigProperty DROP_PARTITION_COLUMNS = ConfigProperty @@ -607,7 +607,7 @@ public String getUrlEncodePartitioning() { return getString(URL_ENCODE_PARTITIONING); } - public Boolean isDropPartitionColumns() { + public Boolean shouldDropPartitionColumns() { return getBooleanOrDefault(DROP_PARTITION_COLUMNS); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java index 38b5509cd577f..251a990d87c04 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java @@ -30,7 +30,6 @@ import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.HoodieTimelineTimeZone; -import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieArchivedTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; @@ -49,6 +48,8 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathFilter; +import org.apache.hudi.hadoop.CachingPath; +import org.apache.hudi.hadoop.SerializablePath; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -84,7 +85,6 @@ public class HoodieTableMetaClient implements Serializable { public static final String BOOTSTRAP_INDEX_ROOT_FOLDER_PATH = AUXILIARYFOLDER_NAME + Path.SEPARATOR + ".bootstrap"; public static final String HEARTBEAT_FOLDER_NAME = METAFOLDER_NAME + Path.SEPARATOR + ".heartbeat"; public static final String METADATA_TABLE_FOLDER_PATH = METAFOLDER_NAME + Path.SEPARATOR + "metadata"; - public static final String COLUMN_STATISTICS_INDEX_NAME = ".colstatsindex"; public static final String BOOTSTRAP_INDEX_BY_PARTITION_FOLDER_PATH = BOOTSTRAP_INDEX_ROOT_FOLDER_PATH + Path.SEPARATOR + ".partitions"; public static final String BOOTSTRAP_INDEX_BY_FILE_ID_FOLDER_PATH = BOOTSTRAP_INDEX_ROOT_FOLDER_PATH + Path.SEPARATOR @@ -94,9 +94,13 @@ public class HoodieTableMetaClient implements Serializable { public static final String MARKER_EXTN = ".marker"; - private String basePath; + // NOTE: Since those two parameters lay on the hot-path of a lot of computations, we + // use tailored extension of the {@code Path} class allowing to avoid repetitive + // computations secured by its immutability + private SerializablePath basePath; + private SerializablePath metaPath; + private transient HoodieWrapperFileSystem fs; - private String metaPath; private boolean loadActiveTimelineOnLoad; private SerializableConfiguration hadoopConf; private HoodieTableType tableType; @@ -114,13 +118,11 @@ private 
HoodieTableMetaClient(Configuration conf, String basePath, boolean loadA this.consistencyGuardConfig = consistencyGuardConfig; this.fileSystemRetryConfig = fileSystemRetryConfig; this.hadoopConf = new SerializableConfiguration(conf); - Path basePathDir = new Path(basePath); - this.basePath = basePathDir.toString(); - this.metaPath = new Path(basePath, METAFOLDER_NAME).toString(); - Path metaPathDir = new Path(this.metaPath); + this.basePath = new SerializablePath(new CachingPath(basePath)); + this.metaPath = new SerializablePath(new CachingPath(basePath, METAFOLDER_NAME)); this.fs = getFs(); - TableNotFoundException.checkTableValidity(fs, basePathDir, metaPathDir); - this.tableConfig = new HoodieTableConfig(fs, metaPath, payloadClassName); + TableNotFoundException.checkTableValidity(fs, this.basePath.get(), metaPath.get()); + this.tableConfig = new HoodieTableConfig(fs, metaPath.toString(), payloadClassName); this.tableType = tableConfig.getTableType(); Option tableConfigVersion = tableConfig.getTimelineLayoutVersion(); if (layoutVersion.isPresent() && tableConfigVersion.isPresent()) { @@ -147,8 +149,13 @@ private HoodieTableMetaClient(Configuration conf, String basePath, boolean loadA public HoodieTableMetaClient() {} public static HoodieTableMetaClient reload(HoodieTableMetaClient oldMetaClient) { - return HoodieTableMetaClient.builder().setConf(oldMetaClient.hadoopConf.get()).setBasePath(oldMetaClient.basePath).setLoadActiveTimelineOnLoad(oldMetaClient.loadActiveTimelineOnLoad) - .setConsistencyGuardConfig(oldMetaClient.consistencyGuardConfig).setLayoutVersion(Option.of(oldMetaClient.timelineLayoutVersion)).setPayloadClassName(null) + return HoodieTableMetaClient.builder() + .setConf(oldMetaClient.hadoopConf.get()) + .setBasePath(oldMetaClient.basePath.toString()) + .setLoadActiveTimelineOnLoad(oldMetaClient.loadActiveTimelineOnLoad) + .setConsistencyGuardConfig(oldMetaClient.consistencyGuardConfig) + .setLayoutVersion(Option.of(oldMetaClient.timelineLayoutVersion)) + .setPayloadClassName(null) .setFileSystemRetryConfig(oldMetaClient.fileSystemRetryConfig).build(); } @@ -159,6 +166,7 @@ public static HoodieTableMetaClient reload(HoodieTableMetaClient oldMetaClient) */ private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException { in.defaultReadObject(); + fs = null; // will be lazily initialized } @@ -166,11 +174,20 @@ private void writeObject(java.io.ObjectOutputStream out) throws IOException { out.defaultWriteObject(); } + /** + * Returns base path of the table + */ + public Path getBasePathV2() { + return basePath.get(); + } + /** * @return Base path + * @deprecated please use {@link #getBasePathV2()} */ + @Deprecated public String getBasePath() { - return basePath; + return basePath.get().toString(); // this invocation is cached } /** @@ -184,21 +201,14 @@ public HoodieTableType getTableType() { * @return Meta path */ public String getMetaPath() { - return metaPath; - } - - /** - * @return Column Statistics index path - */ - public String getColumnStatsIndexPath() { - return new Path(metaPath, COLUMN_STATISTICS_INDEX_NAME).toString(); + return metaPath.get().toString(); // this invocation is cached } /** * @return schema folder path */ public String getSchemaFolderName() { - return new Path(metaPath, SCHEMA_FOLDER_NAME).toString(); + return new Path(metaPath.get(), SCHEMA_FOLDER_NAME).toString(); } /** @@ -270,7 +280,7 @@ public TimelineLayoutVersion getTimelineLayoutVersion() { */ public HoodieWrapperFileSystem getFs() { if (fs == null) { - 
FileSystem fileSystem = FSUtils.getFs(metaPath, hadoopConf.newCopy()); + FileSystem fileSystem = FSUtils.getFs(metaPath.get(), hadoopConf.newCopy()); if (fileSystemRetryConfig.isFileSystemActionRetryEnable()) { fileSystem = new HoodieRetryWrapperFileSystem(fileSystem, @@ -366,16 +376,15 @@ public HoodieArchivedTimeline getArchivedTimeline(String startTs) { /** * Validate table properties. * @param properties Properties from writeConfig. - * @param operationType operation type to be executed. */ - public void validateTableProperties(Properties properties, WriteOperationType operationType) { - // once meta fields are disabled, it cant be re-enabled for a given table. + public void validateTableProperties(Properties properties) { + // Once meta fields are disabled, it cant be re-enabled for a given table. if (!getTableConfig().populateMetaFields() && Boolean.parseBoolean((String) properties.getOrDefault(HoodieTableConfig.POPULATE_META_FIELDS.key(), HoodieTableConfig.POPULATE_META_FIELDS.defaultValue()))) { throw new HoodieException(HoodieTableConfig.POPULATE_META_FIELDS.key() + " already disabled for the table. Can't be re-enabled back"); } - // meta fields can be disabled only with SimpleKeyGenerator + // Meta fields can be disabled only when {@code SimpleKeyGenerator} is used if (!getTableConfig().populateMetaFields() && !properties.getProperty(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key(), "org.apache.hudi.keygen.SimpleKeyGenerator") .equals("org.apache.hudi.keygen.SimpleKeyGenerator")) { @@ -437,8 +446,7 @@ public static HoodieTableMetaClient initTableAndGetMetaClient(Configuration hado return metaClient; } - public static void initializeBootstrapDirsIfNotExists(Configuration hadoopConf, - String basePath, FileSystem fs) throws IOException { + public static void initializeBootstrapDirsIfNotExists(Configuration hadoopConf, String basePath, FileSystem fs) throws IOException { // Create bootstrap index by partition folder if it does not exist final Path bootstrap_index_folder_by_partition = @@ -542,7 +550,7 @@ public String getCommitActionType() { */ public List scanHoodieInstantsFromFileSystem(Set includedExtensions, boolean applyLayoutVersionFilters) throws IOException { - return scanHoodieInstantsFromFileSystem(new Path(metaPath), includedExtensions, applyLayoutVersionFilters); + return scanHoodieInstantsFromFileSystem(metaPath.get(), includedExtensions, applyLayoutVersionFilters); } /** @@ -599,19 +607,7 @@ public String toString() { } public void initializeBootstrapDirsIfNotExists() throws IOException { - initializeBootstrapDirsIfNotExists(getHadoopConf(), basePath, getFs()); - } - - public void setBasePath(String basePath) { - this.basePath = basePath; - } - - public void setMetaPath(String metaPath) { - this.metaPath = metaPath; - } - - public void setActiveTimeline(HoodieActiveTimeline activeTimeline) { - this.activeTimeline = activeTimeline; + initializeBootstrapDirsIfNotExists(getHadoopConf(), basePath.toString(), getFs()); } public static Builder builder() { @@ -700,7 +696,9 @@ public static class PropertyBuilder { private Boolean urlEncodePartitioning; private HoodieTimelineTimeZone commitTimeZone; private Boolean partitionMetafileUseBaseFormat; - private Boolean dropPartitionColumnsWhenWrite; + private Boolean shouldDropPartitionColumns; + private String metadataPartitions; + private String inflightMetadataPartitions; /** * Persist the configs that is written at the first time, and should not be changed. 
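A hedged illustration of how call sites can move to the Path-based accessor added to HoodieTableMetaClient above. The builder calls mirror the ones reload() already uses; the table location and the derived sub-path are hypothetical.

```
// Illustrative only: prefer getBasePathV2() so derived paths reuse the cached
// CachingPath instead of re-parsing the base path string on every call.
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
    .setConf(hadoopConf)              // an existing Hadoop Configuration (assumed)
    .setBasePath("/tmp/hudi_table")   // hypothetical table location
    .build();

Path basePath = metaClient.getBasePathV2();        // new, cached Path accessor
String legacyBasePath = metaClient.getBasePath();  // deprecated String accessor
Path partitionPath = new Path(basePath, "2022/03/01"); // hypothetical derived path
```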
@@ -820,8 +818,18 @@ public PropertyBuilder setPartitionMetafileUseBaseFormat(Boolean useBaseFormat) return this; } - public PropertyBuilder setDropPartitionColumnsWhenWrite(Boolean dropPartitionColumnsWhenWrite) { - this.dropPartitionColumnsWhenWrite = dropPartitionColumnsWhenWrite; + public PropertyBuilder setShouldDropPartitionColumns(Boolean shouldDropPartitionColumns) { + this.shouldDropPartitionColumns = shouldDropPartitionColumns; + return this; + } + + public PropertyBuilder setMetadataPartitions(String partitions) { + this.metadataPartitions = partitions; + return this; + } + + public PropertyBuilder setInflightMetadataPartitions(String partitions) { + this.inflightMetadataPartitions = partitions; return this; } @@ -923,9 +931,14 @@ public PropertyBuilder fromProperties(Properties properties) { if (hoodieConfig.contains(HoodieTableConfig.PARTITION_METAFILE_USE_BASE_FORMAT)) { setPartitionMetafileUseBaseFormat(hoodieConfig.getBoolean(HoodieTableConfig.PARTITION_METAFILE_USE_BASE_FORMAT)); } - if (hoodieConfig.contains(HoodieTableConfig.DROP_PARTITION_COLUMNS)) { - setDropPartitionColumnsWhenWrite(hoodieConfig.getBoolean(HoodieTableConfig.DROP_PARTITION_COLUMNS)); + setShouldDropPartitionColumns(hoodieConfig.getBoolean(HoodieTableConfig.DROP_PARTITION_COLUMNS)); + } + if (hoodieConfig.contains(HoodieTableConfig.TABLE_METADATA_PARTITIONS)) { + setMetadataPartitions(hoodieConfig.getString(HoodieTableConfig.TABLE_METADATA_PARTITIONS)); + } + if (hoodieConfig.contains(HoodieTableConfig.TABLE_METADATA_PARTITIONS_INFLIGHT)) { + setInflightMetadataPartitions(hoodieConfig.getString(HoodieTableConfig.TABLE_METADATA_PARTITIONS_INFLIGHT)); } return this; } @@ -1008,9 +1021,14 @@ public Properties build() { if (null != partitionMetafileUseBaseFormat) { tableConfig.setValue(HoodieTableConfig.PARTITION_METAFILE_USE_BASE_FORMAT, partitionMetafileUseBaseFormat.toString()); } - - if (null != dropPartitionColumnsWhenWrite) { - tableConfig.setValue(HoodieTableConfig.DROP_PARTITION_COLUMNS, Boolean.toString(dropPartitionColumnsWhenWrite)); + if (null != shouldDropPartitionColumns) { + tableConfig.setValue(HoodieTableConfig.DROP_PARTITION_COLUMNS, Boolean.toString(shouldDropPartitionColumns)); + } + if (null != metadataPartitions) { + tableConfig.setValue(HoodieTableConfig.TABLE_METADATA_PARTITIONS, metadataPartitions); + } + if (null != inflightMetadataPartitions) { + tableConfig.setValue(HoodieTableConfig.TABLE_METADATA_PARTITIONS_INFLIGHT, inflightMetadataPartitions); } return tableConfig.getProps(); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java b/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java index 262157a8aefa7..f178a23eeec7a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java @@ -23,11 +23,9 @@ import org.apache.avro.Schema.Field; import org.apache.avro.SchemaCompatibility; import org.apache.avro.generic.IndexedRecord; - import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.io.hfile.CacheConfig; - import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieFileFormat; @@ -47,15 +45,13 @@ import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIncompatibleSchemaException; import org.apache.hudi.exception.InvalidTableException; 
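/*
 * Illustrative sketch (not part of the patch itself): exercising the renamed and newly added
 * PropertyBuilder setters when initializing a table. The table name, base path and partition
 * lists are made-up examples; the metadata-partition values are the comma-separated strings
 * persisted into hoodie.properties.
 */
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient;

class PropertyBuilderSketch {
  static HoodieTableMetaClient init(Configuration hadoopConf) throws java.io.IOException {
    return HoodieTableMetaClient.withPropertyBuilder()
        .setTableType(HoodieTableType.COPY_ON_WRITE)
        .setTableName("trips")
        .setShouldDropPartitionColumns(true)          // replaces setDropPartitionColumnsWhenWrite
        .setMetadataPartitions("files,column_stats")  // HoodieTableConfig.TABLE_METADATA_PARTITIONS
        .setInflightMetadataPartitions("bloom_filters")
        .initTable(hadoopConf, "/tmp/hudi/trips");
  }
}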
-import org.apache.hudi.io.storage.HoodieHFileReader; -import org.apache.hudi.io.storage.HoodieOrcReader; import org.apache.hudi.internal.schema.InternalSchema; import org.apache.hudi.internal.schema.io.FileBasedInternalSchemaStorageManager; import org.apache.hudi.internal.schema.utils.SerDeHelper; - +import org.apache.hudi.io.storage.HoodieHFileReader; +import org.apache.hudi.io.storage.HoodieOrcReader; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; - import org.apache.parquet.avro.AvroSchemaConverter; import org.apache.parquet.format.converter.ParquetMetadataConverter; import org.apache.parquet.hadoop.ParquetFileReader; @@ -67,6 +63,9 @@ import java.util.Arrays; import java.util.List; +import static org.apache.hudi.avro.AvroSchemaUtils.appendFieldsToSchema; +import static org.apache.hudi.avro.AvroSchemaUtils.createNullableSchema; + /** * Helper class to read schema from data files and log files and to convert it between different formats. * @@ -189,7 +188,7 @@ public Schema getTableAvroSchema(boolean includeMetadataFields) throws Exception } Option partitionFieldsOpt = metaClient.getTableConfig().getPartitionFields(); - if (metaClient.getTableConfig().isDropPartitionColumns()) { + if (metaClient.getTableConfig().shouldDropPartitionColumns()) { schema = recreateSchemaWhenDropPartitionColumns(partitionFieldsOpt, schema); } return schema; @@ -222,9 +221,9 @@ public static Schema recreateSchemaWhenDropPartitionColumns(Option par List newFields = new ArrayList<>(); for (String partitionField: partitionFields) { newFields.add(new Schema.Field( - partitionField, Schema.create(Schema.Type.STRING), "", JsonProperties.NULL_VALUE)); + partitionField, createNullableSchema(Schema.Type.STRING), "", JsonProperties.NULL_VALUE)); } - schema = HoodieAvroUtils.createNewSchemaWithExtraFields(schema, newFields); + schema = appendFieldsToSchema(schema, newFields); } } return schema; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java index 9e56083b262e0..9687136444eeb 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java @@ -58,6 +58,7 @@ import java.util.ArrayDeque; import java.util.Arrays; import java.util.Deque; +import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Set; @@ -379,7 +380,7 @@ private void processDataBlock(HoodieDataBlock dataBlock, Option keySpec Option schemaOption = getMergedSchema(dataBlock); while (recordIterator.hasNext()) { IndexedRecord currentRecord = recordIterator.next(); - IndexedRecord record = schemaOption.isPresent() ? HoodieAvroUtils.rewriteRecordWithNewSchema(currentRecord, schemaOption.get()) : currentRecord; + IndexedRecord record = schemaOption.isPresent() ? 
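/*
 * Illustrative sketch (not part of the patch itself): the re-created partition columns are now
 * declared as nullable strings instead of non-null strings. In plain Avro terms this is the union
 * below; AvroSchemaUtils.createNullableSchema(Schema.Type.STRING) is expected to yield an
 * equivalent shape.
 */
import java.util.Arrays;
import org.apache.avro.Schema;

class NullablePartitionFieldSketch {
  static Schema nullableString() {
    // union {"null", "string"}: a record read from a file that no longer stores the partition
    // column can carry null for it instead of failing schema validation
    return Schema.createUnion(Arrays.asList(
        Schema.create(Schema.Type.NULL),
        Schema.create(Schema.Type.STRING)));
  }
}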
HoodieAvroUtils.rewriteRecordWithNewSchema(currentRecord, schemaOption.get(), new HashMap<>()) : currentRecord; processNextRecord(createHoodieRecord(record, this.hoodieTableMetaClient.getTableConfig(), this.payloadClassFQN, this.preCombineField, this.withOperationField, this.simpleKeyGenFields, this.partitionName)); totalLogRecords.incrementAndGet(); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java index ed18736443288..e3d8554d00fd8 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java @@ -25,6 +25,7 @@ import org.apache.hudi.common.model.HoodieOperation; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.CollectionUtils; import org.apache.hudi.common.util.DefaultSizeEstimator; import org.apache.hudi.common.util.HoodieRecordSizeEstimator; import org.apache.hudi.common.util.HoodieTimer; @@ -37,6 +38,8 @@ import org.apache.avro.Schema; import org.apache.hadoop.fs.FileSystem; import org.apache.hudi.internal.schema.InternalSchema; + +import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -45,6 +48,7 @@ import java.util.List; import java.util.Map; +import static org.apache.hudi.common.fs.FSUtils.getRelativePartitionPath; import static org.apache.hudi.common.util.ValidationUtils.checkState; /** @@ -310,6 +314,9 @@ public Builder withPartition(String partitionName) { @Override public HoodieMergedLogRecordScanner build() { + if (this.partitionName == null && CollectionUtils.nonEmpty(this.logFilePaths)) { + this.partitionName = getRelativePartitionPath(new Path(basePath), new Path(this.logFilePaths.get(0)).getParent()); + } return new HoodieMergedLogRecordScanner(fs, basePath, logFilePaths, readerSchema, latestInstantTime, maxMemorySizeInBytes, readBlocksLazily, reverseReader, bufferSize, spillableMapBasePath, instantRange, diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java index 208d7ef2ba456..c6e618ac4769a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java @@ -95,7 +95,7 @@ public abstract class AbstractTableFileSystemView implements SyncableFileSystemV private BootstrapIndex bootstrapIndex; private String getPartitionPathFromFilePath(String fullPath) { - return FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), new Path(fullPath).getParent()); + return FSUtils.getRelativePartitionPath(metaClient.getBasePathV2(), new Path(fullPath).getParent()); } /** @@ -172,7 +172,7 @@ protected List buildFileGroups(Stream baseFileS Map, List> logFiles = logFileStream.collect(Collectors.groupingBy((logFile) -> { String partitionPathStr = - FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), logFile.getPath().getParent()); + FSUtils.getRelativePartitionPath(metaClient.getBasePathV2(), logFile.getPath().getParent()); return Pair.of(partitionPathStr, logFile.getFileId()); })); @@ -299,7 +299,7 @@ private void 
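/*
 * Illustrative sketch (not part of the patch itself): how the scanner builder's new fallback
 * derives the partition when withPartition(..) was not supplied, using the first log file's
 * parent directory relative to the table base path. The paths below are made up.
 */
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.FSUtils;

class PartitionInferenceSketch {
  static String inferPartition() {
    Path basePath = new Path("s3://warehouse/trips");
    Path logFile = new Path("s3://warehouse/trips/2022/06/01/.f1_20220601_1-0-1.log.1_1-0-1");
    // relative path of the log file's parent w.r.t. the base path -> "2022/06/01"
    return FSUtils.getRelativePartitionPath(basePath, logFile.getParent());
  }
}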
ensurePartitionLoadedCorrectly(String partition) { try { LOG.info("Building file system view for partition (" + partitionPathStr + ")"); - Path partitionPath = FSUtils.getPartitionPath(metaClient.getBasePath(), partitionPathStr); + Path partitionPath = FSUtils.getPartitionPath(metaClient.getBasePathV2(), partitionPathStr); long beginLsTs = System.currentTimeMillis(); FileStatus[] statuses = listPartition(partitionPath); long endLsTs = System.currentTimeMillis(); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewStorageConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewStorageConfig.java index e2342edc3a351..63f10855bad84 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewStorageConfig.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewStorageConfig.java @@ -78,7 +78,7 @@ public class FileSystemViewStorageConfig extends HoodieConfig { public static final ConfigProperty SPILLABLE_MEMORY = ConfigProperty .key("hoodie.filesystem.view.spillable.mem") .defaultValue(100 * 1024 * 1024L) // 100 MB - .withDocumentation("Amount of memory to be used for holding file system view, before spilling to disk."); + .withDocumentation("Amount of memory to be used in bytes for holding file system view, before spilling to disk."); public static final ConfigProperty SPILLABLE_COMPACTION_MEM_FRACTION = ConfigProperty .key("hoodie.filesystem.view.spillable.compaction.mem.fraction") diff --git a/hudi-common/src/main/java/org/apache/hudi/hadoop/FileNameCachingPath.java b/hudi-common/src/main/java/org/apache/hudi/hadoop/CachingPath.java similarity index 54% rename from hudi-common/src/main/java/org/apache/hudi/hadoop/FileNameCachingPath.java rename to hudi-common/src/main/java/org/apache/hudi/hadoop/CachingPath.java index 873f7f98f7c9e..01b3eb9d409bb 100644 --- a/hudi-common/src/main/java/org/apache/hudi/hadoop/FileNameCachingPath.java +++ b/hudi-common/src/main/java/org/apache/hudi/hadoop/CachingPath.java @@ -20,19 +20,47 @@ import org.apache.hadoop.fs.Path; +import javax.annotation.concurrent.ThreadSafe; +import java.io.Serializable; import java.net.URI; /** + * This is an extension of the {@code Path} class allowing to avoid repetitive + * computations (like {@code getFileName}, {@code toString}) which are secured + * by its immutability + * * NOTE: This class is thread-safe */ -public class FileNameCachingPath extends Path { +@ThreadSafe +public class CachingPath extends Path implements Serializable { - // NOTE: volatile keyword is redundant here and put mostly for reader notice, since all + // NOTE: `volatile` keyword is redundant here and put mostly for reader notice, since all // reads/writes to references are always atomic (including 64-bit JVMs) // https://docs.oracle.com/javase/specs/jls/se8/html/jls-17.html#jls-17.7 private volatile String fileName; + private volatile String fullPathStr; + + public CachingPath(String parent, String child) { + super(parent, child); + } + + public CachingPath(Path parent, String child) { + super(parent, child); + } + + public CachingPath(String parent, Path child) { + super(parent, child); + } + + public CachingPath(Path parent, Path child) { + super(parent, child); + } - public FileNameCachingPath(URI aUri) { + public CachingPath(String pathString) throws IllegalArgumentException { + super(pathString); + } + + public CachingPath(URI aUri) { super(aUri); } @@ -45,4 +73,14 @@ public String getName() { } return fileName; } + + 
@Override + public String toString() { + // This value could be overwritten concurrently and that's okay, since + // {@code Path} is immutable + if (fullPathStr == null) { + fullPathStr = super.toString(); + } + return fullPathStr; + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/hadoop/SerializablePath.java b/hudi-common/src/main/java/org/apache/hudi/hadoop/SerializablePath.java new file mode 100644 index 0000000000000..5ad2307ef804a --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/hadoop/SerializablePath.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.hadoop; + +import org.apache.hadoop.fs.Path; + +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.Serializable; +import java.util.Objects; + +/** + * {@link Serializable} wrapper encapsulating {@link Path} + */ +public class SerializablePath implements Serializable { + + private Path path; + + public SerializablePath(Path path) { + this.path = path; + } + + public Path get() { + return path; + } + + private void writeObject(ObjectOutputStream out) throws IOException { + out.writeUTF(path.toString()); + } + + private void readObject(ObjectInputStream in) throws IOException { + String pathStr = in.readUTF(); + path = new CachingPath(pathStr); + } + + @Override + public String toString() { + return path.toString(); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + SerializablePath that = (SerializablePath) o; + return Objects.equals(path, that.path); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/internal/schema/action/InternalSchemaMerger.java b/hudi-common/src/main/java/org/apache/hudi/internal/schema/action/InternalSchemaMerger.java index 0d93ab170b374..bcea9b957b3ea 100644 --- a/hudi-common/src/main/java/org/apache/hudi/internal/schema/action/InternalSchemaMerger.java +++ b/hudi-common/src/main/java/org/apache/hudi/internal/schema/action/InternalSchemaMerger.java @@ -48,6 +48,25 @@ public class InternalSchemaMerger { // we can pass decimalType to reWriteRecordWithNewSchema directly, everything is ok. private boolean useColumnTypeFromFileSchema = true; + // deal with rename + // Whether to use column name from file schema to read files when we find some column name has changed. + // spark parquetReader need the original column name to read data, otherwise the parquetReader will read nothing. + // eg: current column name is colOldName, now we rename it to colNewName, + // we should not pass colNewName to parquetReader, we must pass colOldName to it; when we read out the data. 
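/*
 * Illustrative sketch (not part of the patch itself): the caching idiom CachingPath relies on.
 * The cached field is a plain volatile reference with no locking: two threads may both compute
 * the value, but because the delegate is immutable the results are identical, so the race only
 * costs a redundant computation, never an incorrect read.
 */
class MemoizedToStringSketch {
  private final Object immutableDelegate;
  private volatile String cachedStr; // racy single-check idiom

  MemoizedToStringSketch(Object immutableDelegate) {
    this.immutableDelegate = immutableDelegate;
  }

  @Override
  public String toString() {
    String s = cachedStr;
    if (s == null) {
      s = immutableDelegate.toString(); // deterministic for an immutable delegate
      cachedStr = s;                    // last writer wins; all writers produce the same value
    }
    return s;
  }
}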
+ // for log reader + // since our reWriteRecordWithNewSchema function support rewrite directly, so we no need this parameter + // eg: current column name is colOldName, now we rename it to colNewName, + // we can pass colNewName to reWriteRecordWithNewSchema directly, everything is ok. + private boolean useColNameFromFileSchema = true; + + public InternalSchemaMerger(InternalSchema fileSchema, InternalSchema querySchema, boolean ignoreRequiredAttribute, boolean useColumnTypeFromFileSchema, boolean useColNameFromFileSchema) { + this.fileSchema = fileSchema; + this.querySchema = querySchema; + this.ignoreRequiredAttribute = ignoreRequiredAttribute; + this.useColumnTypeFromFileSchema = useColumnTypeFromFileSchema; + this.useColNameFromFileSchema = useColNameFromFileSchema; + } + public InternalSchemaMerger(InternalSchema fileSchema, InternalSchema querySchema, boolean ignoreRequiredAttribute, boolean useColumnTypeFromFileSchema) { this.fileSchema = fileSchema; this.querySchema = querySchema; @@ -131,12 +150,15 @@ private List buildRecordType(List oldFields, List fieldNames) { } return result; } + + /** + * Try to find all renamed cols between oldSchema and newSchema. + * + * @param oldSchema oldSchema + * @param newSchema newSchema which modified from oldSchema + * @return renameCols Map. (k, v) -> (colNameFromNewSchema, colNameFromOldSchema) + */ + public static Map collectRenameCols(InternalSchema oldSchema, InternalSchema newSchema) { + List colNamesFromWriteSchema = oldSchema.getAllColsFullName(); + return colNamesFromWriteSchema.stream().filter(f -> { + int filedIdFromWriteSchema = oldSchema.findIdByName(f); + // try to find the cols which has the same id, but have different colName; + return newSchema.getAllIds().contains(filedIdFromWriteSchema) && !newSchema.findfullName(filedIdFromWriteSchema).equalsIgnoreCase(f); + }).collect(Collectors.toMap(e -> newSchema.findfullName(oldSchema.findIdByName(e)), e -> e)); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java index 58d186f971cb8..c9bdc59da9763 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java @@ -54,6 +54,7 @@ import org.apache.hudi.common.util.hash.PartitionIndexID; import org.apache.hudi.exception.HoodieMetadataException; import org.apache.hudi.io.storage.HoodieHFileReader; +import org.apache.hudi.util.Lazy; import java.io.IOException; import java.math.BigDecimal; @@ -143,6 +144,31 @@ public class HoodieMetadataPayload implements HoodieRecordPayload METADATA_COLUMN_STATS_BUILDER_STUB = Lazy.lazily(HoodieMetadataColumnStats::newBuilder); + private static final Lazy STRING_WRAPPER_BUILDER_STUB = Lazy.lazily(StringWrapper::newBuilder); + private static final Lazy BYTES_WRAPPER_BUILDER_STUB = Lazy.lazily(BytesWrapper::newBuilder); + private static final Lazy DOUBLE_WRAPPER_BUILDER_STUB = Lazy.lazily(DoubleWrapper::newBuilder); + private static final Lazy FLOAT_WRAPPER_BUILDER_STUB = Lazy.lazily(FloatWrapper::newBuilder); + private static final Lazy LONG_WRAPPER_BUILDER_STUB = Lazy.lazily(LongWrapper::newBuilder); + private static final Lazy INT_WRAPPER_BUILDER_STUB = Lazy.lazily(IntWrapper::newBuilder); + private static final Lazy BOOLEAN_WRAPPER_BUILDER_STUB = Lazy.lazily(BooleanWrapper::newBuilder); + private static final Lazy TIMESTAMP_MICROS_WRAPPER_BUILDER_STUB = 
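/*
 * Illustrative sketch (not part of the patch itself): the shape of the rename map produced by
 * collectRenameCols and consumed by rewriteRecordWithNewSchema. The column names are made up.
 */
import java.util.HashMap;
import java.util.Map;

class RenameColsSketch {
  static Map<String, String> exampleRenameMap() {
    // Suppose a field kept its id but was renamed from "colOldName" (file schema) to
    // "colNewName" (query schema). The map is keyed by the *new* name and points at the *old*
    // name, so the reader can fetch the value under the name that actually exists in the file.
    Map<String, String> renamed = new HashMap<>();
    renamed.put("colNewName", "colOldName");
    return renamed;
  }
}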
Lazy.lazily(TimestampMicrosWrapper::newBuilder); + private static final Lazy DECIMAL_WRAPPER_BUILDER_STUB = Lazy.lazily(DecimalWrapper::newBuilder); + private static final Lazy DATE_WRAPPER_BUILDER_STUB = Lazy.lazily(DateWrapper::newBuilder); + private String key = null; private int type = 0; private Map filesystemMetadata = null; @@ -201,7 +227,7 @@ public HoodieMetadataPayload(Option recordOpt) { checkArgument(record.getSchema().getField(SCHEMA_FIELD_ID_COLUMN_STATS) == null, String.format("Valid %s record expected for type: %s", SCHEMA_FIELD_ID_COLUMN_STATS, METADATA_TYPE_COLUMN_STATS)); } else { - columnStatMetadata = HoodieMetadataColumnStats.newBuilder() + columnStatMetadata = HoodieMetadataColumnStats.newBuilder(METADATA_COLUMN_STATS_BUILDER_STUB.get()) .setFileName((String) columnStatsRecord.get(COLUMN_STATS_FIELD_FILE_NAME)) .setColumnName((String) columnStatsRecord.get(COLUMN_STATS_FIELD_COLUMN_NAME)) .setMinValue(columnStatsRecord.get(COLUMN_STATS_FIELD_MIN_VALUE)) @@ -605,7 +631,7 @@ private static HoodieMetadataColumnStats mergeColumnStatsRecords(HoodieMetadataC .max(Comparator.naturalOrder()) .orElse(null); - return HoodieMetadataColumnStats.newBuilder() + return HoodieMetadataColumnStats.newBuilder(METADATA_COLUMN_STATS_BUILDER_STUB.get()) .setFileName(newColumnStats.getFileName()) .setColumnName(newColumnStats.getColumnName()) .setMinValue(wrapStatisticValue(minValue)) @@ -653,11 +679,13 @@ private static Object wrapStatisticValue(Comparable statValue) { LocalDate localDate = statValue instanceof LocalDate ? (LocalDate) statValue : ((Date) statValue).toLocalDate(); - return DateWrapper.newBuilder().setValue((int) localDate.toEpochDay()).build(); + return DateWrapper.newBuilder(DATE_WRAPPER_BUILDER_STUB.get()) + .setValue((int) localDate.toEpochDay()) + .build(); } else if (statValue instanceof BigDecimal) { Schema valueSchema = DecimalWrapper.SCHEMA$.getField("value").schema(); BigDecimal upcastDecimal = tryUpcastDecimal((BigDecimal) statValue, (LogicalTypes.Decimal) valueSchema.getLogicalType()); - return DecimalWrapper.newBuilder() + return DecimalWrapper.newBuilder(DECIMAL_WRAPPER_BUILDER_STUB.get()) .setValue(AVRO_DECIMAL_CONVERSION.toBytes(upcastDecimal, valueSchema, valueSchema.getLogicalType())) .build(); } else if (statValue instanceof Timestamp) { @@ -665,23 +693,23 @@ private static Object wrapStatisticValue(Comparable statValue) { // rely on logical types to do proper encoding of the native Java types, // and hereby have to encode statistic manually Instant instant = ((Timestamp) statValue).toInstant(); - return TimestampMicrosWrapper.newBuilder() + return TimestampMicrosWrapper.newBuilder(TIMESTAMP_MICROS_WRAPPER_BUILDER_STUB.get()) .setValue(instantToMicros(instant)) .build(); } else if (statValue instanceof Boolean) { - return BooleanWrapper.newBuilder().setValue((Boolean) statValue).build(); + return BooleanWrapper.newBuilder(BOOLEAN_WRAPPER_BUILDER_STUB.get()).setValue((Boolean) statValue).build(); } else if (statValue instanceof Integer) { - return IntWrapper.newBuilder().setValue((Integer) statValue).build(); + return IntWrapper.newBuilder(INT_WRAPPER_BUILDER_STUB.get()).setValue((Integer) statValue).build(); } else if (statValue instanceof Long) { - return LongWrapper.newBuilder().setValue((Long) statValue).build(); + return LongWrapper.newBuilder(LONG_WRAPPER_BUILDER_STUB.get()).setValue((Long) statValue).build(); } else if (statValue instanceof Float) { - return FloatWrapper.newBuilder().setValue((Float) statValue).build(); + return 
FloatWrapper.newBuilder(FLOAT_WRAPPER_BUILDER_STUB.get()).setValue((Float) statValue).build(); } else if (statValue instanceof Double) { - return DoubleWrapper.newBuilder().setValue((Double) statValue).build(); + return DoubleWrapper.newBuilder(DOUBLE_WRAPPER_BUILDER_STUB.get()).setValue((Double) statValue).build(); } else if (statValue instanceof ByteBuffer) { - return BytesWrapper.newBuilder().setValue((ByteBuffer) statValue).build(); + return BytesWrapper.newBuilder(BYTES_WRAPPER_BUILDER_STUB.get()).setValue((ByteBuffer) statValue).build(); } else if (statValue instanceof String || statValue instanceof Utf8) { - return StringWrapper.newBuilder().setValue(statValue.toString()).build(); + return StringWrapper.newBuilder(STRING_WRAPPER_BUILDER_STUB.get()).setValue(statValue.toString()).build(); } else { throw new UnsupportedOperationException(String.format("Unsupported type of the statistic (%s)", statValue.getClass())); } diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java index 3904ff6f832c9..c0e97f33091dd 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java @@ -89,10 +89,10 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import static org.apache.hudi.avro.AvroSchemaUtils.resolveNullableSchema; import static org.apache.hudi.avro.HoodieAvroUtils.addMetadataFields; import static org.apache.hudi.avro.HoodieAvroUtils.convertValueForSpecificDataTypes; import static org.apache.hudi.avro.HoodieAvroUtils.getNestedFieldSchemaFromWriteSchema; -import static org.apache.hudi.avro.HoodieAvroUtils.resolveNullableSchema; import static org.apache.hudi.common.util.StringUtils.isNullOrEmpty; import static org.apache.hudi.common.util.ValidationUtils.checkState; import static org.apache.hudi.metadata.HoodieMetadataPayload.unwrapStatisticValueWrapper; diff --git a/hudi-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroUtils.java b/hudi-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroUtils.java index 8c57dc84dead4..bd0254da3dc6e 100644 --- a/hudi-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroUtils.java @@ -32,6 +32,7 @@ import java.math.BigDecimal; import java.nio.ByteBuffer; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Map; @@ -227,6 +228,42 @@ public void testAddingAndRemovingMetadataFields() { assertEquals(NUM_FIELDS_IN_EXAMPLE_SCHEMA, schemaWithoutMetaCols.getFields().size()); } + @Test + public void testRemoveFields() { + // partitioned table test. 
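/*
 * Illustrative sketch (not part of the patch itself): the builder-stub pattern used above for the
 * statistic wrapper records. A fresh Avro specific-record builder repeats schema/default setup on
 * every call; copying from a lazily created stub is intended to skip most of that per-record work.
 */
import org.apache.hudi.avro.model.IntWrapper;
import org.apache.hudi.util.Lazy;

class BuilderStubSketch {
  // One stub per wrapper type, created on first use and then reused only as a copy source
  private static final Lazy<IntWrapper.Builder> INT_STUB = Lazy.lazily(IntWrapper::newBuilder);

  static IntWrapper wrap(int value) {
    return IntWrapper.newBuilder(INT_STUB.get()).setValue(value).build();
  }
}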
+ String schemaStr = "{\"type\": \"record\",\"name\": \"testrec\",\"fields\": [ " + + "{\"name\": \"timestamp\",\"type\": \"double\"},{\"name\": \"_row_key\", \"type\": \"string\"}," + + "{\"name\": \"non_pii_col\", \"type\": \"string\"}]},"; + Schema expectedSchema = new Schema.Parser().parse(schemaStr); + GenericRecord rec = new GenericData.Record(new Schema.Parser().parse(EXAMPLE_SCHEMA)); + rec.put("_row_key", "key1"); + rec.put("non_pii_col", "val1"); + rec.put("pii_col", "val2"); + rec.put("timestamp", 3.5); + GenericRecord rec1 = HoodieAvroUtils.removeFields(rec, Arrays.asList("pii_col")); + assertEquals("key1", rec1.get("_row_key")); + assertEquals("val1", rec1.get("non_pii_col")); + assertEquals(3.5, rec1.get("timestamp")); + assertNull(rec1.get("pii_col")); + assertEquals(expectedSchema, rec1.getSchema()); + + // non-partitioned table test with empty list of fields. + schemaStr = "{\"type\": \"record\",\"name\": \"testrec\",\"fields\": [ " + + "{\"name\": \"timestamp\",\"type\": \"double\"},{\"name\": \"_row_key\", \"type\": \"string\"}," + + "{\"name\": \"non_pii_col\", \"type\": \"string\"}," + + "{\"name\": \"pii_col\", \"type\": \"string\"}]},"; + expectedSchema = new Schema.Parser().parse(schemaStr); + rec1 = HoodieAvroUtils.removeFields(rec, Arrays.asList("")); + assertEquals(expectedSchema, rec1.getSchema()); + } + + @Test + public void testGetRootLevelFieldName() { + assertEquals("a", HoodieAvroUtils.getRootLevelFieldName("a.b.c")); + assertEquals("a", HoodieAvroUtils.getRootLevelFieldName("a")); + assertEquals("", HoodieAvroUtils.getRootLevelFieldName("")); + } + @Test public void testGetNestedFieldVal() { GenericRecord rec = new GenericData.Record(new Schema.Parser().parse(EXAMPLE_SCHEMA)); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormat.java b/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormat.java index 536fec609542f..4fa53bb41f9f8 100755 --- a/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormat.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormat.java @@ -89,7 +89,6 @@ import java.util.stream.Collectors; import java.util.stream.Stream; -import static org.apache.hudi.common.fs.FSUtils.getRelativePartitionPath; import static org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -575,13 +574,12 @@ public void testBasicAppendAndScanMultipleFiles(ExternalSpillableMap.DiskMapType writer.close(); FileCreateUtils.createDeltaCommit(basePath, "100", fs); // scan all log blocks (across multiple log files) - List logFilePaths = logFiles.stream() - .map(logFile -> logFile.getPath().toString()).collect(Collectors.toList()); - assertTrue(logFilePaths.size() > 0); HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder() .withFileSystem(fs) .withBasePath(basePath) - .withLogFilePaths(logFilePaths) + .withLogFilePaths( + logFiles.stream() + .map(logFile -> logFile.getPath().toString()).collect(Collectors.toList())) .withReaderSchema(schema) .withLatestInstantTime("100") .withMaxMemorySizeInBytes(10240L) @@ -591,7 +589,6 @@ public void testBasicAppendAndScanMultipleFiles(ExternalSpillableMap.DiskMapType .withSpillableMapBasePath(BASE_OUTPUT_PATH) .withDiskMapType(diskMapType) .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) - .withPartition(getRelativePartitionPath(new 
Path(basePath), new Path(logFilePaths.get(0)).getParent())) .build(); List scannedRecords = new ArrayList<>(); @@ -806,7 +803,6 @@ public void testAvroLogRecordReaderBasic(ExternalSpillableMap.DiskMapType diskMa .withSpillableMapBasePath(BASE_OUTPUT_PATH) .withDiskMapType(diskMapType) .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) - .withPartition(getRelativePartitionPath(new Path(basePath), new Path(allLogFiles.get(0)).getParent())) .build(); assertEquals(200, scanner.getTotalLogRecords()); Set readKeys = new HashSet<>(200); @@ -885,7 +881,6 @@ public void testAvroLogRecordReaderWithRollbackTombstone(ExternalSpillableMap.Di .withSpillableMapBasePath(BASE_OUTPUT_PATH) .withDiskMapType(diskMapType) .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) - .withPartition(getRelativePartitionPath(new Path(basePath), new Path(allLogFiles.get(0)).getParent())) .build(); assertEquals(200, scanner.getTotalLogRecords(), "We read 200 records from 2 write batches"); Set readKeys = new HashSet<>(200); @@ -973,7 +968,6 @@ public void testAvroLogRecordReaderWithFailedPartialBlock(ExternalSpillableMap.D .withSpillableMapBasePath(BASE_OUTPUT_PATH) .withDiskMapType(diskMapType) .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) - .withPartition(getRelativePartitionPath(new Path(basePath), new Path(allLogFiles.get(0)).getParent())) .build(); assertEquals(200, scanner.getTotalLogRecords(), "We would read 200 records"); Set readKeys = new HashSet<>(200); @@ -1052,7 +1046,6 @@ public void testAvroLogRecordReaderWithDeleteAndRollback(ExternalSpillableMap.Di .withSpillableMapBasePath(BASE_OUTPUT_PATH) .withDiskMapType(diskMapType) .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) - .withPartition(getRelativePartitionPath(new Path(basePath), new Path(allLogFiles.get(0)).getParent())) .build(); assertEquals(200, scanner.getTotalLogRecords(), "We still would read 200 records"); @@ -1099,7 +1092,6 @@ public void testAvroLogRecordReaderWithDeleteAndRollback(ExternalSpillableMap.Di .withSpillableMapBasePath(BASE_OUTPUT_PATH) .withDiskMapType(diskMapType) .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) - .withPartition(getRelativePartitionPath(new Path(basePath), new Path(allLogFiles.get(0)).getParent())) .build(); scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey())); assertEquals(200, readKeys.size(), "Stream collect should return all 200 records after rollback of delete"); @@ -1195,7 +1187,6 @@ public void testAvroLogRecordReaderWithDisorderDelete(ExternalSpillableMap.DiskM .withSpillableMapBasePath(BASE_OUTPUT_PATH) .withDiskMapType(diskMapType) .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) - .withPartition(getRelativePartitionPath(new Path(basePath), new Path(allLogFiles.get(0)).getParent())) .build(); assertEquals(200, scanner.getTotalLogRecords(), "We still would read 200 records"); @@ -1299,7 +1290,6 @@ public void testAvroLogRecordReaderWithFailedRollbacks(ExternalSpillableMap.Disk .withSpillableMapBasePath(BASE_OUTPUT_PATH) .withDiskMapType(diskMapType) .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) - .withPartition(getRelativePartitionPath(new Path(basePath), new Path(allLogFiles.get(0)).getParent())) .build(); assertEquals(0, scanner.getTotalLogRecords(), "We would have scanned 0 records because of rollback"); @@ -1368,7 +1358,6 @@ public void testAvroLogRecordReaderWithInsertDeleteAndRollback(ExternalSpillable .withSpillableMapBasePath(BASE_OUTPUT_PATH) .withDiskMapType(diskMapType) 
.withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) - .withPartition(getRelativePartitionPath(new Path(basePath), new Path(allLogFiles.get(0)).getParent())) .build(); assertEquals(0, scanner.getTotalLogRecords(), "We would read 0 records"); FileCreateUtils.deleteDeltaCommit(basePath, "100", fs); @@ -1420,7 +1409,6 @@ public void testAvroLogRecordReaderWithInvalidRollback(ExternalSpillableMap.Disk .withSpillableMapBasePath(BASE_OUTPUT_PATH) .withDiskMapType(diskMapType) .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) - .withPartition(getRelativePartitionPath(new Path(basePath), new Path(allLogFiles.get(0)).getParent())) .build(); assertEquals(100, scanner.getTotalLogRecords(), "We still would read 100 records"); final List readKeys = new ArrayList<>(100); @@ -1491,7 +1479,6 @@ public void testAvroLogRecordReaderWithInsertsDeleteAndRollback(ExternalSpillabl .withSpillableMapBasePath(BASE_OUTPUT_PATH) .withDiskMapType(diskMapType) .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) - .withPartition(getRelativePartitionPath(new Path(basePath), new Path(allLogFiles.get(0)).getParent())) .build(); assertEquals(0, scanner.getTotalLogRecords(), "We would read 0 records"); } @@ -1598,7 +1585,6 @@ public void testAvroLogRecordReaderWithMixedInsertsCorruptsAndRollback(ExternalS .withSpillableMapBasePath(BASE_OUTPUT_PATH) .withDiskMapType(diskMapType) .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) - .withPartition(getRelativePartitionPath(new Path(basePath), new Path(allLogFiles.get(0)).getParent())) .build(); assertEquals(0, scanner.getTotalLogRecords(), "We would read 0 records"); FileCreateUtils.deleteDeltaCommit(basePath, "100", fs); @@ -1673,7 +1659,6 @@ private void testAvroLogRecordReaderMergingMultipleLogFiles(int numRecordsInLog1 .withSpillableMapBasePath(BASE_OUTPUT_PATH) .withDiskMapType(diskMapType) .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled) - .withPartition(getRelativePartitionPath(new Path(basePath), new Path(allLogFiles.get(0)).getParent())) .build(); assertEquals(Math.max(numRecordsInLog1, numRecordsInLog2), scanner.getNumMergedRecordsInLog(), diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/TestTableSchemaResolver.java b/hudi-common/src/test/java/org/apache/hudi/common/table/TestTableSchemaResolver.java index 59a24a79f013f..e0e57e812b8a2 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/TestTableSchemaResolver.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/TestTableSchemaResolver.java @@ -20,6 +20,7 @@ import org.apache.avro.Schema; +import org.apache.hudi.avro.AvroSchemaUtils; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.util.Option; @@ -57,7 +58,7 @@ public void testRecreateSchemaWhenDropPartitionColumns() { assertNotEquals(originSchema, s4); assertTrue(s4.getFields().stream().anyMatch(f -> f.name().equals("user_partition"))); Schema.Field f = s4.getField("user_partition"); - assertEquals(f.schema().getType().getName(), "string"); + assertEquals(f.schema(), AvroSchemaUtils.createNullableSchema(Schema.Type.STRING)); // case5: user_partition is in originSchema, but partition_path is in originSchema String[] pts4 = {"user_partition", "partition_path"}; diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileCreateUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileCreateUtils.java index 27dd9df5edd5d..231915072914d 100644 --- 
a/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileCreateUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileCreateUtils.java @@ -248,6 +248,10 @@ public static void createRequestedRollbackFile(String basePath, String instantTi createMetaFile(basePath, instantTime, HoodieTimeline.REQUESTED_ROLLBACK_EXTENSION, content); } + public static void createRequestedRollbackFile(String basePath, String instantTime) throws IOException { + createMetaFile(basePath, instantTime, HoodieTimeline.REQUESTED_ROLLBACK_EXTENSION); + } + public static void createInflightRollbackFile(String basePath, String instantTime) throws IOException { createMetaFile(basePath, instantTime, HoodieTimeline.INFLIGHT_ROLLBACK_EXTENSION); } @@ -399,9 +403,19 @@ public static List getPartitionPaths(Path basePath) throws IOException { if (Files.notExists(basePath)) { return Collections.emptyList(); } - return Files.list(basePath).filter(entry -> (!entry.getFileName().toString().equals(HoodieTableMetaClient.METAFOLDER_NAME) - && !entry.getFileName().toString().contains("parquet") && !entry.getFileName().toString().contains("log")) - && !entry.getFileName().toString().startsWith(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX)).collect(Collectors.toList()); + return Files.list(basePath).filter(entry -> !entry.getFileName().toString().equals(HoodieTableMetaClient.METAFOLDER_NAME) + && !isBaseOrLogFilename(entry.getFileName().toString()) + && !entry.getFileName().toString().startsWith(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX)) + .collect(Collectors.toList()); + } + + public static boolean isBaseOrLogFilename(String filename) { + for (HoodieFileFormat format : HoodieFileFormat.values()) { + if (filename.contains(format.getFileExtension())) { + return true; + } + } + return false; } /** diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestTable.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestTable.java index f0aae0a69d8b3..8cb0661c57d05 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestTable.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestTable.java @@ -282,13 +282,13 @@ public HoodieTestTable addInflightClean(String instantTime, HoodieCleanerPlan cl } public HoodieTestTable addClean(String instantTime, HoodieCleanerPlan cleanerPlan, HoodieCleanMetadata metadata) throws IOException { - return addClean(instantTime, cleanerPlan, metadata, false); + return addClean(instantTime, cleanerPlan, metadata, false, false); } - public HoodieTestTable addClean(String instantTime, HoodieCleanerPlan cleanerPlan, HoodieCleanMetadata metadata, boolean isEmpty) throws IOException { - createRequestedCleanFile(basePath, instantTime, cleanerPlan, isEmpty); - createInflightCleanFile(basePath, instantTime, cleanerPlan, isEmpty); - createCleanFile(basePath, instantTime, metadata, isEmpty); + public HoodieTestTable addClean(String instantTime, HoodieCleanerPlan cleanerPlan, HoodieCleanMetadata metadata, boolean isEmptyForAll, boolean isEmptyCompleted) throws IOException { + createRequestedCleanFile(basePath, instantTime, cleanerPlan, isEmptyForAll); + createInflightCleanFile(basePath, instantTime, cleanerPlan, isEmptyForAll); + createCleanFile(basePath, instantTime, metadata, isEmptyCompleted); currentInstantTime = instantTime; return this; } @@ -335,6 +335,7 @@ public HoodieTestTable addRollback(String instantTime, HoodieRollbackMetadata ro } public HoodieTestTable 
addRollback(String instantTime, HoodieRollbackMetadata rollbackMetadata, boolean isEmpty) throws IOException { + createRequestedRollbackFile(basePath, instantTime); createInflightRollbackFile(basePath, instantTime); createRollbackFile(basePath, instantTime, rollbackMetadata, isEmpty); currentInstantTime = instantTime; @@ -680,7 +681,8 @@ public FileStatus[] listAllFilesInPartition(String partitionPath) throws IOExcep boolean toReturn = true; String filePath = entry.getPath().toString(); String fileName = entry.getPath().getName(); - if (fileName.startsWith(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX) || (!fileName.contains("log") && !fileName.contains("parquet")) + if (fileName.startsWith(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX) + || !FileCreateUtils.isBaseOrLogFilename(fileName) || filePath.contains("metadata")) { toReturn = false; } else { diff --git a/hudi-common/src/test/java/org/apache/hudi/internal/schema/utils/TestAvroSchemaEvolutionUtils.java b/hudi-common/src/test/java/org/apache/hudi/internal/schema/utils/TestAvroSchemaEvolutionUtils.java index d116697b8dc4a..3850ef07b90a3 100644 --- a/hudi-common/src/test/java/org/apache/hudi/internal/schema/utils/TestAvroSchemaEvolutionUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/internal/schema/utils/TestAvroSchemaEvolutionUtils.java @@ -284,7 +284,7 @@ public void testReWriteRecordWithTypeChanged() { .updateColumnType("col6", Types.StringType.get()); InternalSchema newSchema = SchemaChangeUtils.applyTableChanges2Schema(internalSchema, updateChange); Schema newAvroSchema = AvroInternalSchemaConverter.convert(newSchema, avroSchema.getName()); - GenericRecord newRecord = HoodieAvroUtils.rewriteRecordWithNewSchema(avroRecord, newAvroSchema); + GenericRecord newRecord = HoodieAvroUtils.rewriteRecordWithNewSchema(avroRecord, newAvroSchema, new HashMap<>()); Assertions.assertEquals(GenericData.get().validate(newAvroSchema, newRecord), true); } @@ -349,7 +349,7 @@ public void testReWriteNestRecord() { ); Schema newAvroSchema = AvroInternalSchemaConverter.convert(newRecord, schema.getName()); - GenericRecord newAvroRecord = HoodieAvroUtils.rewriteRecordWithNewSchema(avroRecord, newAvroSchema); + GenericRecord newAvroRecord = HoodieAvroUtils.rewriteRecordWithNewSchema(avroRecord, newAvroSchema, new HashMap<>()); // test the correctly of rewrite Assertions.assertEquals(GenericData.get().validate(newAvroSchema, newAvroRecord), true); } diff --git a/hudi-common/src/test/resources/log4j-surefire.properties b/hudi-common/src/test/resources/log4j-surefire.properties index c5bdf75ae2ae3..31841a6a3a1ec 100644 --- a/hudi-common/src/test/resources/log4j-surefire.properties +++ b/hudi-common/src/test/resources/log4j-surefire.properties @@ -20,9 +20,9 @@ log4j.logger.org.apache=INFO log4j.logger.org.apache.hudi=DEBUG log4j.logger.org.apache.hadoop.hbase=ERROR -# A1 is set to be a ConsoleAppender. +# CONSOLE is set to be a ConsoleAppender. log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. +# CONSOLE uses PatternLayout. 
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter diff --git a/hudi-examples/hudi-examples-flink/src/test/java/org/apache/hudi/examples/quickstart/TestQuickstartData.java b/hudi-examples/hudi-examples-flink/src/test/java/org/apache/hudi/examples/quickstart/TestQuickstartData.java index 97a682c3a3903..67691a3ec7bd1 100644 --- a/hudi-examples/hudi-examples-flink/src/test/java/org/apache/hudi/examples/quickstart/TestQuickstartData.java +++ b/hudi-examples/hudi-examples-flink/src/test/java/org/apache/hudi/examples/quickstart/TestQuickstartData.java @@ -57,8 +57,6 @@ import java.util.stream.IntStream; import static junit.framework.TestCase.assertEquals; -import static org.apache.hudi.common.fs.FSUtils.getRelativePartitionPath; -import static org.apache.hudi.common.util.CollectionUtils.isNullOrEmpty; import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.MatcherAssert.assertThat; import static org.junit.jupiter.api.Assertions.assertNotNull; @@ -348,7 +346,7 @@ private static HoodieMergedLogRecordScanner getScanner( List logPaths, Schema readSchema, String instant) { - HoodieMergedLogRecordScanner.Builder logRecordScannerBuilder = HoodieMergedLogRecordScanner.newBuilder() + return HoodieMergedLogRecordScanner.newBuilder() .withFileSystem(fs) .withBasePath(basePath) .withLogFilePaths(logPaths) @@ -360,12 +358,8 @@ private static HoodieMergedLogRecordScanner getScanner( .withMaxMemorySizeInBytes(1024 * 1024L) .withSpillableMapBasePath("/tmp/") .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue()) - .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue()); - if (!isNullOrEmpty(logPaths)) { - logRecordScannerBuilder - .withPartition(getRelativePartitionPath(new Path(basePath), new Path(logPaths.get(0)).getParent())); - } - return logRecordScannerBuilder.build(); + .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue()) + .build(); } /** diff --git a/hudi-examples/hudi-examples-flink/src/test/resources/log4j-surefire.properties b/hudi-examples/hudi-examples-flink/src/test/resources/log4j-surefire.properties index 8dcd17f303f6b..ac2807e332775 100644 --- a/hudi-examples/hudi-examples-flink/src/test/resources/log4j-surefire.properties +++ b/hudi-examples/hudi-examples-flink/src/test/resources/log4j-surefire.properties @@ -20,9 +20,9 @@ log4j.logger.org.apache=INFO log4j.logger.org.apache.hudi=DEBUG log4j.logger.org.apache.hadoop.hbase=ERROR -# A1 is set to be a ConsoleAppender. +# CONSOLE is set to be a ConsoleAppender. log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. +# CONSOLE uses PatternLayout. 
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter diff --git a/hudi-examples/hudi-examples-spark/src/test/resources/log4j-surefire.properties b/hudi-examples/hudi-examples-spark/src/test/resources/log4j-surefire.properties index 8dcd17f303f6b..ac2807e332775 100644 --- a/hudi-examples/hudi-examples-spark/src/test/resources/log4j-surefire.properties +++ b/hudi-examples/hudi-examples-spark/src/test/resources/log4j-surefire.properties @@ -20,9 +20,9 @@ log4j.logger.org.apache=INFO log4j.logger.org.apache.hudi=DEBUG log4j.logger.org.apache.hadoop.hbase=ERROR -# A1 is set to be a ConsoleAppender. +# CONSOLE is set to be a ConsoleAppender. log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. +# CONSOLE uses PatternLayout. log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java index e2be7d364b77f..c944f6a299144 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java @@ -21,6 +21,7 @@ import org.apache.hudi.common.config.ConfigClassProperty; import org.apache.hudi.common.config.ConfigGroups; import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.common.model.HoodieCleaningPolicy; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; import org.apache.hudi.config.HoodieIndexConfig; @@ -103,7 +104,7 @@ private FlinkOptions() { .key("metadata.compaction.delta_commits") .intType() .defaultValue(10) - .withDescription("Max delta commits for metadata table to trigger compaction, default 24"); + .withDescription("Max delta commits for metadata table to trigger compaction, default 10"); // ------------------------------------------------------------------------ // Index Options @@ -137,7 +138,7 @@ private FlinkOptions() { .key("index.partition.regex") .stringType() .defaultValue(".*") - .withDescription("Whether to load partitions in state if partition path matching, default *"); + .withDescription("Whether to load partitions in state if partition path matching, default `*`"); // ------------------------------------------------------------------------ // Read Options @@ -367,13 +368,14 @@ private FlinkOptions() { public static final String PARTITION_FORMAT_HOUR = "yyyyMMddHH"; public static final String PARTITION_FORMAT_DAY = "yyyyMMdd"; + public static final String PARTITION_FORMAT_DASHED_DAY = "yyyy-MM-dd"; public static final ConfigOption PARTITION_FORMAT = ConfigOptions .key("write.partition.format") .stringType() .noDefaultValue() .withDescription("Partition path format, only valid when 'write.datetime.partitioning' is true, default is:\n" + "1) 'yyyyMMddHH' for timestamp(3) WITHOUT TIME ZONE, LONG, FLOAT, DOUBLE, DECIMAL;\n" - + "2) 'yyyyMMdd' for DAY and INT."); + + "2) 'yyyyMMdd' for DATE and INT."); public static final ConfigOption INDEX_BOOTSTRAP_TASKS = ConfigOptions .key("write.index_bootstrap.tasks") @@ -541,7 +543,7 @@ 
private FlinkOptions() { .key("compaction.target_io") .longType() .defaultValue(500 * 1024L) // default 500 GB - .withDescription("Target IO per compaction (both read and write), default 500 GB"); + .withDescription("Target IO in MB for per compaction (both read and write), default 500 GB"); public static final ConfigOption CLEAN_ASYNC_ENABLED = ConfigOptions .key("clean.async.enabled") @@ -549,6 +551,13 @@ private FlinkOptions() { .defaultValue(true) .withDescription("Whether to cleanup the old commits immediately on new commits, enabled by default"); + public static final ConfigOption CLEAN_POLICY = ConfigOptions + .key("clean.policy") + .stringType() + .defaultValue(HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name()) + .withDescription("Clean policy to manage the Hudi table. Available option: KEEP_LATEST_COMMITS, KEEP_LATEST_FILE_VERSIONS, KEEP_LATEST_BY_HOURS." + + "Default is KEEP_LATEST_COMMITS."); + public static final ConfigOption CLEAN_RETAIN_COMMITS = ConfigOptions .key("clean.retain_commits") .intType() @@ -556,6 +565,12 @@ private FlinkOptions() { .withDescription("Number of commits to retain. So data will be retained for num_of_commits * time_between_commits (scheduled).\n" + "This also directly translates into how much you can incrementally pull on this table, default 30"); + public static final ConfigOption CLEAN_RETAIN_FILE_VERSIONS = ConfigOptions + .key("clean.retain_file_versions") + .intType() + .defaultValue(5)// default 5 version + .withDescription("Number of file versions to retain. default 5"); + public static final ConfigOption ARCHIVE_MAX_COMMITS = ConfigOptions .key("archive.max_commits") .intType() diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteOperatorCoordinator.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteOperatorCoordinator.java index b5ec08a583d43..023b1e696583a 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteOperatorCoordinator.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteOperatorCoordinator.java @@ -30,6 +30,7 @@ import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.configuration.OptionsResolver; import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.sink.event.CommitAckEvent; import org.apache.hudi.sink.event.WriteMetadataEvent; import org.apache.hudi.sink.meta.CkpMetadata; import org.apache.hudi.sink.utils.HiveSyncContext; @@ -42,6 +43,7 @@ import org.apache.flink.runtime.jobgraph.OperatorID; import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; import org.apache.flink.runtime.operators.coordination.OperatorEvent; +import org.apache.flink.runtime.operators.coordination.TaskNotRunningException; import org.jetbrains.annotations.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -429,6 +431,31 @@ private void handleWriteMetaEvent(WriteMetadataEvent event) { addEventToBuffer(event); } + /** + * The coordinator reuses the instant if there is no data for this round of checkpoint, + * sends the commit ack events to unblock the flushing. 
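/*
 * Illustrative sketch (not part of the patch itself): setting the new cleaning options on a Flink
 * job configuration. The file-versions policy is only meaningful together with
 * clean.retain_file_versions; the values below are examples.
 */
import org.apache.flink.configuration.Configuration;
import org.apache.hudi.common.model.HoodieCleaningPolicy;
import org.apache.hudi.configuration.FlinkOptions;

class CleanPolicySketch {
  static Configuration cleaningConf() {
    Configuration conf = new Configuration();
    conf.setString(FlinkOptions.CLEAN_POLICY, HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS.name());
    conf.setInteger(FlinkOptions.CLEAN_RETAIN_FILE_VERSIONS, 5); // keep the latest 5 versions per file group
    return conf;
  }
}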
+ */ + private void sendCommitAckEvents(long checkpointId) { + CompletableFuture[] futures = Arrays.stream(this.gateways).filter(Objects::nonNull) + .map(gw -> gw.sendEvent(CommitAckEvent.getInstance(checkpointId))) + .toArray(CompletableFuture[]::new); + CompletableFuture.allOf(futures).whenComplete((resp, error) -> { + if (!sendToFinishedTasks(error)) { + throw new HoodieException("Error while waiting for the commit ack events to finish sending", error); + } + }); + } + + /** + * Decides whether the given exception is caused by sending events to FINISHED tasks. + * + *

Ugly impl: the exception may change in the future. + */ + private static boolean sendToFinishedTasks(Throwable throwable) { + return throwable.getCause() instanceof TaskNotRunningException + || throwable.getCause().getMessage().contains("running"); + } + /** * Commits the instant. */ @@ -456,6 +483,10 @@ private boolean commitInstant(String instant, long checkpointId) { if (writeResults.size() == 0) { // No data has written, reset the buffer and returns early reset(); + // Send commit ack event to the write function to unblock the flushing + // If this checkpoint has no inputs while the next checkpoint has inputs, + // the 'isConfirming' flag should be switched with the ack event. + sendCommitAckEvents(checkpointId); return false; } doCommit(instant, writeResults); diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/append/AppendWriteFunction.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/append/AppendWriteFunction.java index a72b885a22c76..7b40718b35c24 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/append/AppendWriteFunction.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/append/AppendWriteFunction.java @@ -123,9 +123,9 @@ private void flushData(boolean endInput) { writeStatus = this.writerHelper.getWriteStatuses(this.taskID); instant = this.writerHelper.getInstantTime(); } else { - LOG.info("No data to write in subtask [{}] for instant [{}]", taskID, currentInstant); writeStatus = Collections.emptyList(); instant = instantToWrite(false); + LOG.info("No data to write in subtask [{}] for instant [{}]", taskID, instant); } final WriteMetadataEvent event = WriteMetadataEvent.builder() .taskID(taskID) diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bucket/BucketStreamWriteFunction.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bucket/BucketStreamWriteFunction.java index 1456e8882f024..11d5f36436b78 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bucket/BucketStreamWriteFunction.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bucket/BucketStreamWriteFunction.java @@ -75,11 +75,6 @@ public class BucketStreamWriteFunction extends StreamWriteFunction { */ private Set incBucketIndex; - /** - * Returns whether this is an empty table. - */ - private boolean isEmptyTable; - /** * Constructs a BucketStreamWriteFunction. * @@ -99,7 +94,6 @@ public void open(Configuration parameters) throws IOException { this.bucketToLoad = getBucketToLoad(); this.bucketIndex = new HashMap<>(); this.incBucketIndex = new HashSet<>(); - this.isEmptyTable = !this.metaClient.getActiveTimeline().filterCompletedInstants().lastInstant().isPresent(); } @Override @@ -162,7 +156,7 @@ private Set getBucketToLoad() { * This is a required operation for each restart to avoid having duplicate file ids for one bucket. 
*/ private void bootstrapIndexIfNeed(String partition) { - if (isEmptyTable || bucketIndex.containsKey(partition)) { + if (bucketIndex.containsKey(partition)) { return; } LOG.info(String.format("Loading Hoodie Table %s, with path %s", this.metaClient.getTableConfig().getTableName(), @@ -170,8 +164,8 @@ private void bootstrapIndexIfNeed(String partition) { // Load existing fileID belongs to this task Map bucketToFileIDMap = new HashMap<>(); - this.writeClient.getHoodieTable().getHoodieView().getLatestFileSlices(partition).forEach(fileSlice -> { - String fileID = fileSlice.getFileId(); + this.writeClient.getHoodieTable().getFileSystemView().getAllFileGroups(partition).forEach(fileGroup -> { + String fileID = fileGroup.getFileGroupId().getFileId(); int bucketNumber = BucketIdentifier.bucketIdFromFileId(fileID); if (bucketToLoad.contains(bucketNumber)) { LOG.info(String.format("Should load this partition bucket %s with fileID %s", bucketNumber, fileID)); diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/common/AbstractStreamWriteFunction.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/common/AbstractStreamWriteFunction.java index 4e8712b6619f2..98085fa74f5a7 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/common/AbstractStreamWriteFunction.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/common/AbstractStreamWriteFunction.java @@ -247,7 +247,7 @@ protected String instantToWrite(boolean hasData) { // wait condition: // 1. there is no inflight instant // 2. the inflight instant does not change and the checkpoint has buffering data - if (instant == null || (instant.equals(this.currentInstant) && hasData && !this.ckpMetadata.isAborted(instant))) { + if (instant == null || invalidInstant(instant, hasData)) { // sleep for a while timeWait.waitFor(); // refresh the inflight instant @@ -260,4 +260,11 @@ protected String instantToWrite(boolean hasData) { } return instant; } + + /** + * Returns whether the pending instant is invalid to write with. 
+ */ + private boolean invalidInstant(String instant, boolean hasData) { + return instant.equals(this.currentInstant) && hasData && !this.ckpMetadata.isAborted(instant); + } } diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionCommitSink.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionCommitSink.java index ecd66936e88c3..c9fb7aceb2e08 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionCommitSink.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionCommitSink.java @@ -101,11 +101,6 @@ public void open(Configuration parameters) throws Exception { @Override public void invoke(CompactionCommitEvent event, Context context) throws Exception { final String instant = event.getInstant(); - if (event.isFailed()) { - // handle failure case - CompactionUtil.rollbackCompaction(table, event.getInstant()); - return; - } commitBuffer.computeIfAbsent(instant, k -> new HashMap<>()) .put(event.getFileId(), event); commitIfNecessary(instant, commitBuffer.get(instant).values()); @@ -132,6 +127,18 @@ private void commitIfNecessary(String instant, Collection if (!isReady) { return; } + + if (events.stream().anyMatch(CompactionCommitEvent::isFailed)) { + try { + // handle failure case + CompactionUtil.rollbackCompaction(table, instant); + } finally { + // remove commitBuffer to avoid obsolete metadata commit + reset(instant); + return; + } + } + try { doCommit(instant, events); } catch (Throwable throwable) { diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionPlanOperator.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionPlanOperator.java index 6df11fe2242f3..48d4f48989b0a 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionPlanOperator.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionPlanOperator.java @@ -88,8 +88,7 @@ public void notifyCheckpointComplete(long checkpointId) { // when the earliest inflight instant has timed out, assumes it has failed // already and just rolls it back. - // comment out: do we really need the timeout rollback ? - // CompactionUtil.rollbackEarliestCompaction(table, conf); + CompactionUtil.rollbackEarliestCompaction(table, conf); scheduleCompaction(table, checkpointId); } catch (Throwable throwable) { // make it fail-safe @@ -99,7 +98,8 @@ public void notifyCheckpointComplete(long checkpointId) { private void scheduleCompaction(HoodieFlinkTable table, long checkpointId) throws IOException { // the first instant takes the highest priority. - Option firstRequested = table.getActiveTimeline().filterPendingCompactionTimeline() + HoodieTimeline pendingCompactionTimeline = table.getActiveTimeline().filterPendingCompactionTimeline(); + Option firstRequested = pendingCompactionTimeline .filter(instant -> instant.getState() == HoodieInstant.State.REQUESTED).firstInstant(); if (!firstRequested.isPresent()) { // do nothing. 
@@ -107,6 +107,13 @@ private void scheduleCompaction(HoodieFlinkTable table, long checkpointId) th return; } + Option firstInflight = pendingCompactionTimeline + .filter(instant -> instant.getState() == HoodieInstant.State.INFLIGHT).firstInstant(); + if (firstInflight.isPresent()) { + LOG.warn("Waiting for pending compaction instant : " + firstInflight + " to complete, skip scheduling new compaction plans"); + return; + } + String compactionInstantTime = firstRequested.get().getTimestamp(); // generate compaction plan diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/utils/HiveSyncContext.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/utils/HiveSyncContext.java index 52ffa85659161..536a0282fbcc4 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/utils/HiveSyncContext.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/utils/HiveSyncContext.java @@ -18,6 +18,7 @@ package org.apache.hudi.sink.utils; +import org.apache.flink.annotation.VisibleForTesting; import org.apache.hudi.aws.sync.AwsGlueCatalogSyncTool; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.configuration.FlinkOptions; @@ -70,7 +71,8 @@ public static HiveSyncContext create(Configuration conf) { return new HiveSyncContext(syncConfig, hiveConf, fs); } - private static HiveSyncConfig buildSyncConfig(Configuration conf) { + @VisibleForTesting + public static HiveSyncConfig buildSyncConfig(Configuration conf) { HiveSyncConfig hiveSyncConfig = new HiveSyncConfig(); hiveSyncConfig.basePath = conf.getString(FlinkOptions.PATH); hiveSyncConfig.baseFileFormat = conf.getString(FlinkOptions.HIVE_SYNC_FILE_FORMAT); @@ -83,7 +85,7 @@ private static HiveSyncConfig buildSyncConfig(Configuration conf) { hiveSyncConfig.tableProperties = conf.getString(FlinkOptions.HIVE_SYNC_TABLE_PROPERTIES); hiveSyncConfig.serdeProperties = conf.getString(FlinkOptions.HIVE_SYNC_TABLE_SERDE_PROPERTIES); hiveSyncConfig.jdbcUrl = conf.getString(FlinkOptions.HIVE_SYNC_JDBC_URL); - hiveSyncConfig.partitionFields = Arrays.asList(FilePathUtils.extractPartitionKeys(conf)); + hiveSyncConfig.partitionFields = Arrays.asList(FilePathUtils.extractHivePartitionFields(conf)); hiveSyncConfig.partitionValueExtractorClass = conf.getString(FlinkOptions.HIVE_SYNC_PARTITION_EXTRACTOR_CLASS_NAME); hiveSyncConfig.useJdbc = conf.getBoolean(FlinkOptions.HIVE_SYNC_USE_JDBC); hiveSyncConfig.useFileListingFromMetadata = conf.getBoolean(FlinkOptions.METADATA_ENABLED); diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/utils/Pipelines.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/utils/Pipelines.java index 28a669075da3c..3b2ee39528a8b 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/utils/Pipelines.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/utils/Pipelines.java @@ -173,9 +173,19 @@ public static DataStreamSink bulkInsert(Configuration conf, RowType rowT * @param conf The configuration * @param rowType The input row type * @param dataStream The input data stream + * @param bounded Whether the input stream is bounded * @return the appending data stream sink */ - public static DataStreamSink append(Configuration conf, RowType rowType, DataStream dataStream) { + public static DataStreamSink append( + Configuration conf, + RowType rowType, + DataStream dataStream, + boolean bounded) { + if (!bounded) { + // In principle, the config should be 
immutable, but the boundedness + // is only visible when creating the sink pipeline. + conf.setBoolean(FlinkOptions.WRITE_BULK_INSERT_SORT_INPUT, false); + } WriteOperatorFactory operatorFactory = AppendWriteOperator.getFactory(conf, rowType); return dataStream diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/streamer/FlinkStreamerConfig.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/streamer/FlinkStreamerConfig.java index 592520bf902f8..f82712bca2c2a 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/streamer/FlinkStreamerConfig.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/streamer/FlinkStreamerConfig.java @@ -19,6 +19,7 @@ package org.apache.hudi.streamer; import org.apache.hudi.client.utils.OperationConverter; +import org.apache.hudi.common.model.HoodieCleaningPolicy; import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.util.StringUtils; @@ -260,11 +261,20 @@ public class FlinkStreamerConfig extends Configuration { @Parameter(names = {"--clean-async-enabled"}, description = "Whether to cleanup the old commits immediately on new commits, enabled by default") public Boolean cleanAsyncEnabled = true; + @Parameter(names = {"--clean-policy"}, + description = "Clean policy to manage the Hudi table. Available option: KEEP_LATEST_COMMITS, KEEP_LATEST_FILE_VERSIONS, KEEP_LATEST_BY_HOURS." + + "Default is KEEP_LATEST_COMMITS.") + public String cleanPolicy = HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name(); + @Parameter(names = {"--clean-retain-commits"}, description = "Number of commits to retain. So data will be retained for num_of_commits * time_between_commits (scheduled).\n" + "This also directly translates into how much you can incrementally pull on this table, default 10") public Integer cleanRetainCommits = 10; + @Parameter(names = {"--clean-retain-file-versions"}, + description = "Number of file versions to retain. Each file group will be retained for this number of version. 
default 5") + public Integer cleanRetainFileVersions = 5; + @Parameter(names = {"--archive-max-commits"}, description = "Max number of commits to keep before archiving older commits into a sequential log, default 30") public Integer archiveMaxCommits = 30; @@ -392,7 +402,9 @@ public static org.apache.flink.configuration.Configuration toFlinkConfig(FlinkSt conf.setInteger(FlinkOptions.COMPACTION_MAX_MEMORY, config.compactionMaxMemory); conf.setLong(FlinkOptions.COMPACTION_TARGET_IO, config.compactionTargetIo); conf.setBoolean(FlinkOptions.CLEAN_ASYNC_ENABLED, config.cleanAsyncEnabled); + conf.setString(FlinkOptions.CLEAN_POLICY, config.cleanPolicy); conf.setInteger(FlinkOptions.CLEAN_RETAIN_COMMITS, config.cleanRetainCommits); + conf.setInteger(FlinkOptions.CLEAN_RETAIN_FILE_VERSIONS, config.cleanRetainFileVersions); conf.setInteger(FlinkOptions.ARCHIVE_MAX_COMMITS, config.archiveMaxCommits); conf.setInteger(FlinkOptions.ARCHIVE_MIN_COMMITS, config.archiveMinCommits); conf.setBoolean(FlinkOptions.HIVE_SYNC_ENABLED, config.hiveSyncEnabled); diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java index 7543382e19df4..987ae10fe75ce 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java @@ -20,6 +20,7 @@ import org.apache.hudi.common.model.DefaultHoodieRecordPayload; import org.apache.hudi.common.model.EventTimeAvroPayload; +import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.configuration.OptionsResolver; import org.apache.hudi.exception.HoodieValidationException; @@ -52,6 +53,9 @@ import java.util.Collections; import java.util.List; import java.util.Set; +import java.util.concurrent.TimeUnit; + +import static org.apache.hudi.common.util.ValidationUtils.checkArgument; /** * Hoodie data source/sink factory. @@ -81,6 +85,8 @@ public DynamicTableSource createDynamicTableSource(Context context) { @Override public DynamicTableSink createDynamicTableSink(Context context) { Configuration conf = FlinkOptions.fromMap(context.getCatalogTable().getOptions()); + checkArgument(!StringUtils.isNullOrEmpty(conf.getString(FlinkOptions.PATH)), + "Option [path] should not be empty."); ResolvedSchema schema = context.getCatalogTable().getResolvedSchema(); sanityCheck(conf, schema); setupConfOptions(conf, context.getObjectIdentifier().getObjectName(), context.getCatalogTable(), schema); @@ -238,6 +244,11 @@ private static void setupHoodieKeyOptions(Configuration conf, CatalogTable table *

The UTC timezone is used as default. */ public static void setupTimestampKeygenOptions(Configuration conf, DataType fieldType) { + if (conf.contains(FlinkOptions.KEYGEN_CLASS_NAME)) { + // the keygen clazz has been set up explicitly, skipping + return; + } + conf.setString(FlinkOptions.KEYGEN_CLASS_NAME, TimestampBasedAvroKeyGenerator.class.getName()); LOG.info("Table option [{}] is reset to {} because datetime partitioning turns on", FlinkOptions.KEYGEN_CLASS_NAME.key(), TimestampBasedAvroKeyGenerator.class.getName()); @@ -252,13 +263,17 @@ public static void setupTimestampKeygenOptions(Configuration conf, DataType fiel conf.setString(KeyGeneratorOptions.Config.TIMESTAMP_TYPE_FIELD_PROP, TimestampBasedAvroKeyGenerator.TimestampType.EPOCHMILLISECONDS.name()); } - String partitionFormat = conf.getOptional(FlinkOptions.PARTITION_FORMAT).orElse(FlinkOptions.PARTITION_FORMAT_HOUR); - conf.setString(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, partitionFormat); + String outputPartitionFormat = conf.getOptional(FlinkOptions.PARTITION_FORMAT).orElse(FlinkOptions.PARTITION_FORMAT_HOUR); + conf.setString(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, outputPartitionFormat); } else { conf.setString(KeyGeneratorOptions.Config.TIMESTAMP_TYPE_FIELD_PROP, - TimestampBasedAvroKeyGenerator.TimestampType.DATE_STRING.name()); - String partitionFormat = conf.getOptional(FlinkOptions.PARTITION_FORMAT).orElse(FlinkOptions.PARTITION_FORMAT_DAY); - conf.setString(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, partitionFormat); + TimestampBasedAvroKeyGenerator.TimestampType.SCALAR.name()); + conf.setString(KeyGeneratorOptions.Config.INPUT_TIME_UNIT, TimeUnit.DAYS.toString()); + + String outputPartitionFormat = conf.getOptional(FlinkOptions.PARTITION_FORMAT).orElse(FlinkOptions.PARTITION_FORMAT_DAY); + conf.setString(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, outputPartitionFormat); + // the option is actually useless, it only works for validation + conf.setString(KeyGeneratorOptions.Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP, FlinkOptions.PARTITION_FORMAT_DAY); } conf.setString(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_TIMEZONE_FORMAT_PROP, "UTC"); } diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSink.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSink.java index ed99e7b4c1c3d..4dd4f89d03c1d 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSink.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSink.java @@ -78,7 +78,7 @@ public SinkRuntimeProvider getSinkRuntimeProvider(Context context) { // Append mode if (OptionsResolver.isAppendMode(conf)) { - return Pipelines.append(conf, rowType, dataStream); + return Pipelines.append(conf, rowType, dataStream, context.isBounded()); } // default parallelism diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/FilePathUtils.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/FilePathUtils.java index 523062590ea9e..99efa0b36a7ae 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/FilePathUtils.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/FilePathUtils.java @@ -425,4 +425,17 @@ public static String[] extractPartitionKeys(org.apache.flink.configuration.Confi } return 
conf.getString(FlinkOptions.PARTITION_PATH_FIELD).split(","); } + + /** + * Extracts the hive sync partition fields with given configuration. + * + * @param conf The flink configuration + * @return array of the hive partition fields + */ + public static String[] extractHivePartitionFields(org.apache.flink.configuration.Configuration conf) { + if (FlinkOptions.isDefaultValueDefined(conf, FlinkOptions.HIVE_SYNC_PARTITION_FIELDS)) { + return extractPartitionKeys(conf); + } + return conf.getString(FlinkOptions.HIVE_SYNC_PARTITION_FIELDS).split(","); + } } diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/FormatUtils.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/FormatUtils.java index f01993edc61c4..fce9b75f764ea 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/FormatUtils.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/format/FormatUtils.java @@ -18,6 +18,7 @@ package org.apache.hudi.table.format; +import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieOperation; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; @@ -42,7 +43,6 @@ import org.apache.flink.types.RowKind; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import java.util.ArrayList; import java.util.Arrays; @@ -52,10 +52,6 @@ import java.util.Map; import java.util.function.Function; -import static org.apache.hudi.common.fs.FSUtils.getFs; -import static org.apache.hudi.common.fs.FSUtils.getRelativePartitionPath; -import static org.apache.hudi.common.util.CollectionUtils.isNullOrEmpty; - /** * Utilities for format. 
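A small usage sketch of the `extractHivePartitionFields` fallback just above (not part of the patch; field names are illustrative): when `FlinkOptions.HIVE_SYNC_PARTITION_FIELDS` is left at its default, the writer's partition path field is reused for Hive sync, otherwise the explicit value wins.

```java
import org.apache.flink.configuration.Configuration;

import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.table.format.FilePathUtils;

public class HivePartitionFieldsExample {

  public static void main(String[] args) {
    Configuration conf = new Configuration();
    conf.setString(FlinkOptions.PARTITION_PATH_FIELD, "dt");

    // HIVE_SYNC_PARTITION_FIELDS untouched -> falls back to the write partition field.
    String[] fallback = FilePathUtils.extractHivePartitionFields(conf); // ["dt"]

    // Explicit hive sync partition fields take precedence.
    conf.setString(FlinkOptions.HIVE_SYNC_PARTITION_FIELDS, "year,month,day");
    String[] explicit = FilePathUtils.extractHivePartitionFields(conf); // ["year", "month", "day"]

    System.out.println(String.join(",", fallback) + " | " + String.join(",", explicit));
  }
}
```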
*/ @@ -128,13 +124,11 @@ public static HoodieMergedLogRecordScanner logScanner( Schema logSchema, Configuration config, boolean withOperationField) { - String basePath = split.getTablePath(); - List logPaths = split.getLogPaths().get(); - FileSystem fs = getFs(basePath, config); - HoodieMergedLogRecordScanner.Builder logRecordScannerBuilder = HoodieMergedLogRecordScanner.newBuilder() + FileSystem fs = FSUtils.getFs(split.getTablePath(), config); + return HoodieMergedLogRecordScanner.newBuilder() .withFileSystem(fs) - .withBasePath(basePath) - .withLogFilePaths(logPaths) + .withBasePath(split.getTablePath()) + .withLogFilePaths(split.getLogPaths().get()) .withReaderSchema(logSchema) .withLatestInstantTime(split.getLatestCommit()) .withReadBlocksLazily( @@ -150,12 +144,8 @@ public static HoodieMergedLogRecordScanner logScanner( config.get(HoodieRealtimeConfig.SPILLABLE_MAP_BASE_PATH_PROP, HoodieRealtimeConfig.DEFAULT_SPILLABLE_MAP_BASE_PATH)) .withInstantRange(split.getInstantRange()) - .withOperationField(withOperationField); - if (!isNullOrEmpty(logPaths)) { - logRecordScannerBuilder - .withPartition(getRelativePartitionPath(new Path(basePath), new Path(logPaths.get(0)).getParent())); - } - return logRecordScannerBuilder.build(); + .withOperationField(withOperationField) + .build(); } private static HoodieUnMergedLogRecordScanner unMergedLogScanner( @@ -163,7 +153,7 @@ private static HoodieUnMergedLogRecordScanner unMergedLogScanner( Schema logSchema, Configuration config, HoodieUnMergedLogRecordScanner.LogRecordScannerCallback callback) { - FileSystem fs = getFs(split.getTablePath(), config); + FileSystem fs = FSUtils.getFs(split.getTablePath(), config); return HoodieUnMergedLogRecordScanner.newBuilder() .withFileSystem(fs) .withBasePath(split.getTablePath()) @@ -244,8 +234,8 @@ public static HoodieMergedLogRecordScanner logScanner( HoodieWriteConfig writeConfig, Configuration hadoopConf) { String basePath = writeConfig.getBasePath(); - HoodieMergedLogRecordScanner.Builder logRecordScannerBuilder = HoodieMergedLogRecordScanner.newBuilder() - .withFileSystem(getFs(basePath, hadoopConf)) + return HoodieMergedLogRecordScanner.newBuilder() + .withFileSystem(FSUtils.getFs(basePath, hadoopConf)) .withBasePath(basePath) .withLogFilePaths(logPaths) .withReaderSchema(logSchema) @@ -256,12 +246,8 @@ public static HoodieMergedLogRecordScanner logScanner( .withMaxMemorySizeInBytes(writeConfig.getMaxMemoryPerPartitionMerge()) .withSpillableMapBasePath(writeConfig.getSpillableMapBasePath()) .withDiskMapType(writeConfig.getCommonConfig().getSpillableDiskMapType()) - .withBitCaskDiskMapCompressionEnabled(writeConfig.getCommonConfig().isBitCaskDiskMapCompressionEnabled()); - if (!isNullOrEmpty(logPaths)) { - logRecordScannerBuilder - .withPartition(getRelativePartitionPath(new Path(basePath), new Path(logPaths.get(0)).getParent())); - } - return logRecordScannerBuilder.build(); + .withBitCaskDiskMapCompressionEnabled(writeConfig.getCommonConfig().isBitCaskDiskMapCompressionEnabled()) + .build(); } private static Boolean string2Boolean(String s) { diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java index 3138d5d986167..dfbe0efd67c70 100644 --- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java +++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java @@ -176,11 +176,12 @@ public static HoodieWriteConfig 
getHoodieClientConfig( .withMaxDeltaSecondsBeforeCompaction(conf.getInteger(FlinkOptions.COMPACTION_DELTA_SECONDS)) .withAsyncClean(conf.getBoolean(FlinkOptions.CLEAN_ASYNC_ENABLED)) .retainCommits(conf.getInteger(FlinkOptions.CLEAN_RETAIN_COMMITS)) + .retainFileVersions(conf.getInteger(FlinkOptions.CLEAN_RETAIN_FILE_VERSIONS)) // override and hardcode to 20, // actually Flink cleaning is always with parallelism 1 now .withCleanerParallelism(20) .archiveCommitsWith(conf.getInteger(FlinkOptions.ARCHIVE_MIN_COMMITS), conf.getInteger(FlinkOptions.ARCHIVE_MAX_COMMITS)) - .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS) + .withCleanerPolicy(HoodieCleaningPolicy.valueOf(conf.getString(FlinkOptions.CLEAN_POLICY))) .build()) .withMemoryConfig( HoodieMemoryConfig.newBuilder() diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/bulk/TestRowDataKeyGen.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/bulk/TestRowDataKeyGen.java index 822df063b5ab2..5643ca8d04744 100644 --- a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/bulk/TestRowDataKeyGen.java +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/bulk/TestRowDataKeyGen.java @@ -29,6 +29,8 @@ import org.apache.flink.table.data.StringData; import org.apache.flink.table.data.TimestampData; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; import static org.apache.hudi.utils.TestData.insertRow; import static org.hamcrest.CoreMatchers.is; @@ -126,4 +128,40 @@ void testTimestampBasedKeyGenerator() { assertThat(keyGen2.getPartitionPath(rowData2), is("ts=1970010100")); assertThat(keyGen2.getPartitionPath(rowData3), is("ts=1970010100")); } + + @ParameterizedTest + @ValueSource(strings = {FlinkOptions.PARTITION_FORMAT_DASHED_DAY, FlinkOptions.PARTITION_FORMAT_DAY}) + void testDateBasedKeyGenerator(String partitionFormat) { + boolean dashed = partitionFormat.equals(FlinkOptions.PARTITION_FORMAT_DASHED_DAY); + Configuration conf = TestConfigurations.getDefaultConf("path1", TestConfigurations.ROW_DATA_TYPE_DATE); + conf.setString(FlinkOptions.PARTITION_PATH_FIELD, "dt"); + conf.setString(FlinkOptions.PARTITION_FORMAT, partitionFormat); + HoodieTableFactory.setupTimestampKeygenOptions(conf, DataTypes.DATE()); + final RowData rowData1 = insertRow(TestConfigurations.ROW_TYPE_DATE, + StringData.fromString("id1"), StringData.fromString("Danny"), 23, 1); + final RowDataKeyGen keyGen1 = RowDataKeyGen.instance(conf, TestConfigurations.ROW_TYPE_DATE); + + assertThat(keyGen1.getRecordKey(rowData1), is("id1")); + String expectedPartition1 = dashed ? "1970-01-02" : "19700102"; + assertThat(keyGen1.getPartitionPath(rowData1), is(expectedPartition1)); + + // null record key and partition path + final RowData rowData2 = insertRow(TestConfigurations.ROW_TYPE_DATE, null, StringData.fromString("Danny"), 23, null); + assertThrows(HoodieKeyException.class, () -> keyGen1.getRecordKey(rowData2)); + String expectedPartition2 = dashed ? "1970-01-02" : "19700102"; + assertThat(keyGen1.getPartitionPath(rowData2), is(expectedPartition2)); + + // empty record key + String expectedPartition3 = dashed ? 
"1970-01-03" : "19700103"; + final RowData rowData3 = insertRow(TestConfigurations.ROW_TYPE_DATE, StringData.fromString(""), StringData.fromString("Danny"), 23, 2); + assertThrows(HoodieKeyException.class, () -> keyGen1.getRecordKey(rowData3)); + assertThat(keyGen1.getPartitionPath(rowData3), is(expectedPartition3)); + + // hive style partitioning + conf.set(FlinkOptions.HIVE_STYLE_PARTITIONING, true); + final RowDataKeyGen keyGen2 = RowDataKeyGen.instance(conf, TestConfigurations.ROW_TYPE_DATE); + assertThat(keyGen2.getPartitionPath(rowData1), is("dt=" + expectedPartition1)); + assertThat(keyGen2.getPartitionPath(rowData2), is("dt=" + expectedPartition2)); + assertThat(keyGen2.getPartitionPath(rowData3), is("dt=" + expectedPartition3)); + } } diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/TestHiveSyncContext.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/TestHiveSyncContext.java new file mode 100644 index 0000000000000..7bfaade59ea26 --- /dev/null +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/utils/TestHiveSyncContext.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.utils; + +import org.apache.flink.configuration.Configuration; + +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.hive.HiveSyncConfig; + +import org.junit.jupiter.api.Test; + +import java.lang.reflect.Method; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Test cases for {@link HiveSyncContext}. + */ +public class TestHiveSyncContext { + /** + * Test that the file ids generated by the task can finally shuffled to itself. 
+ */ + @Test + public void testBuildSyncConfig() throws Exception { + Configuration configuration1 = new Configuration(); + Configuration configuration2 = new Configuration(); + String hiveSyncPartitionField = "hiveSyncPartitionField"; + String partitionPathField = "partitionPathField"; + + configuration1.setString(FlinkOptions.HIVE_SYNC_PARTITION_FIELDS, hiveSyncPartitionField); + configuration1.setString(FlinkOptions.PARTITION_PATH_FIELD, partitionPathField); + + configuration2.setString(FlinkOptions.PARTITION_PATH_FIELD, partitionPathField); + + Class threadClazz = Class.forName("org.apache.hudi.sink.utils.HiveSyncContext"); + Method buildSyncConfigMethod = threadClazz.getDeclaredMethod("buildSyncConfig", Configuration.class); + buildSyncConfigMethod.setAccessible(true); + + HiveSyncConfig hiveSyncConfig1 = HiveSyncContext.buildSyncConfig(configuration1); + HiveSyncConfig hiveSyncConfig2 = HiveSyncContext.buildSyncConfig(configuration2); + + assertTrue(hiveSyncConfig1.partitionFields.get(0).equals(hiveSyncPartitionField)); + assertTrue(hiveSyncConfig2.partitionFields.get(0).equals(partitionPathField)); + + } +} diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/ITTestHoodieDataSource.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/ITTestHoodieDataSource.java index 786a45cac7ac9..088ddb260dd5f 100644 --- a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/ITTestHoodieDataSource.java +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/ITTestHoodieDataSource.java @@ -1028,6 +1028,37 @@ void testWriteAndReadWithTimestampPartitioning(ExecMode execMode) { + "+I[id8, Han, 56, 1970-01-01T00:00:08, par4]]"); } + @ParameterizedTest + @ValueSource(strings = {FlinkOptions.PARTITION_FORMAT_DAY, FlinkOptions.PARTITION_FORMAT_DASHED_DAY}) + void testWriteAndReadWithDatePartitioning(String partitionFormat) { + TableEnvironment tableEnv = batchTableEnv; + String hoodieTableDDL = sql("t1") + .field("uuid varchar(20)") + .field("name varchar(10)") + .field("age int") + .field("ts date") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath()) + .option(FlinkOptions.PARTITION_FORMAT, partitionFormat) + .partitionField("ts") // use date as partition path field + .end(); + tableEnv.executeSql(hoodieTableDDL); + + execInsertSql(tableEnv, TestSQL.INSERT_DATE_PARTITION_T1); + + List result = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + String expected = "[" + + "+I[id1, Danny, 23, 1970-01-01], " + + "+I[id2, Stephen, 33, 1970-01-01], " + + "+I[id3, Julian, 53, 1970-01-01], " + + "+I[id4, Fabian, 31, 1970-01-01], " + + "+I[id5, Sophia, 18, 1970-01-01], " + + "+I[id6, Emma, 20, 1970-01-01], " + + "+I[id7, Bob, 44, 1970-01-01], " + + "+I[id8, Han, 56, 1970-01-01]]"; + assertRowsEquals(result, expected); + } + @ParameterizedTest @ValueSource(strings = {"bulk_insert", "upsert"}) void testWriteReadDecimals(String operation) { diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableFactory.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableFactory.java index c6a1b0068aa50..efd365064454d 100644 --- a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableFactory.java +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableFactory.java @@ -419,7 +419,6 @@ void testSetupCleaningOptionsForSink() { @Test void 
testSetupTimestampBasedKeyGenForSink() { this.conf.setString(FlinkOptions.RECORD_KEY_FIELD, "dummyField"); - this.conf.setString(FlinkOptions.KEYGEN_CLASS_NAME, "dummyKeyGenClass"); // definition with simple primary key and partition path ResolvedSchema schema1 = SchemaBuilder.instance() .field("f0", DataTypes.INT().notNull()) diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestConfigurations.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestConfigurations.java index e1106671799b7..f2e8f1ab67a7c 100644 --- a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestConfigurations.java +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestConfigurations.java @@ -20,6 +20,7 @@ import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.streamer.FlinkStreamerConfig; +import org.apache.hudi.util.AvroSchemaConverter; import org.apache.hudi.utils.factory.CollectSinkTableFactory; import org.apache.hudi.utils.factory.ContinuousFileSourceFactory; @@ -74,6 +75,15 @@ private TestConfigurations() { public static final RowType ROW_TYPE_WIDER = (RowType) ROW_DATA_TYPE_WIDER.getLogicalType(); + public static final DataType ROW_DATA_TYPE_DATE = DataTypes.ROW( + DataTypes.FIELD("uuid", DataTypes.VARCHAR(20)),// record key + DataTypes.FIELD("name", DataTypes.VARCHAR(10)), + DataTypes.FIELD("age", DataTypes.INT()), + DataTypes.FIELD("dt", DataTypes.DATE())) + .notNull(); + + public static final RowType ROW_TYPE_DATE = (RowType) ROW_DATA_TYPE_DATE.getLogicalType(); + public static String getCreateHoodieTableDDL(String tableName, Map options) { return getCreateHoodieTableDDL(tableName, options, true, "partition"); } @@ -212,6 +222,15 @@ public static Configuration getDefaultConf(String tablePath) { return conf; } + public static Configuration getDefaultConf(String tablePath, DataType dataType) { + Configuration conf = new Configuration(); + conf.setString(FlinkOptions.PATH, tablePath); + conf.setString(FlinkOptions.SOURCE_AVRO_SCHEMA, AvroSchemaConverter.convertToSchema(dataType.getLogicalType()).toString()); + conf.setString(FlinkOptions.TABLE_NAME, "TestHoodieTable"); + conf.setString(FlinkOptions.PARTITION_PATH_FIELD, "partition"); + return conf; + } + public static FlinkStreamerConfig getDefaultStreamerConf(String tablePath) { FlinkStreamerConfig streamerConf = new FlinkStreamerConfig(); streamerConf.targetBasePath = tablePath; diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestData.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestData.java index f2439b4471d3c..c1e924056cfa2 100644 --- a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestData.java +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestData.java @@ -67,8 +67,6 @@ import java.util.stream.IntStream; import static junit.framework.TestCase.assertEquals; -import static org.apache.hudi.common.fs.FSUtils.getRelativePartitionPath; -import static org.apache.hudi.common.util.CollectionUtils.isNullOrEmpty; import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.MatcherAssert.assertThat; import static org.junit.jupiter.api.Assertions.assertNotNull; @@ -663,7 +661,7 @@ private static HoodieMergedLogRecordScanner getScanner( List logPaths, Schema readSchema, String instant) { - HoodieMergedLogRecordScanner.Builder logRecordScannerBuilder = HoodieMergedLogRecordScanner.newBuilder() + return 
HoodieMergedLogRecordScanner.newBuilder() .withFileSystem(fs) .withBasePath(basePath) .withLogFilePaths(logPaths) @@ -675,12 +673,8 @@ private static HoodieMergedLogRecordScanner getScanner( .withMaxMemorySizeInBytes(1024 * 1024L) .withSpillableMapBasePath("/tmp/") .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue()) - .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue()); - if (!isNullOrEmpty(logPaths)) { - logRecordScannerBuilder - .withPartition(getRelativePartitionPath(new Path(basePath), new Path(logPaths.get(0)).getParent())); - } - return logRecordScannerBuilder.build(); + .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue()) + .build(); } /** diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestSQL.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestSQL.java index 1695e4e7149a9..b109fee0fff2a 100644 --- a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestSQL.java +++ b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/utils/TestSQL.java @@ -61,4 +61,14 @@ private TestSQL() { + "(1, array['abc1', 'def1'], array[1, 1], map['abc1', 1, 'def1', 3], row(array['abc1', 'def1'], row(1, 'abc1'))),\n" + "(2, array['abc2', 'def2'], array[2, 2], map['abc2', 1, 'def2', 3], row(array['abc2', 'def2'], row(2, 'abc2'))),\n" + "(3, array['abc3', 'def3'], array[3, 3], map['abc3', 1, 'def3', 3], row(array['abc3', 'def3'], row(3, 'abc3')))"; + + public static final String INSERT_DATE_PARTITION_T1 = "insert into t1 values\n" + + "('id1','Danny',23,DATE '1970-01-01'),\n" + + "('id2','Stephen',33,DATE '1970-01-01'),\n" + + "('id3','Julian',53,DATE '1970-01-01'),\n" + + "('id4','Fabian',31,DATE '1970-01-01'),\n" + + "('id5','Sophia',18,DATE '1970-01-01'),\n" + + "('id6','Emma',20,DATE '1970-01-01'),\n" + + "('id7','Bob',44,DATE '1970-01-01'),\n" + + "('id8','Han',56,DATE '1970-01-01')"; } diff --git a/hudi-flink-datasource/hudi-flink/src/test/resources/log4j-surefire.properties b/hudi-flink-datasource/hudi-flink/src/test/resources/log4j-surefire.properties index 5806188cb5c58..6108218dc582a 100644 --- a/hudi-flink-datasource/hudi-flink/src/test/resources/log4j-surefire.properties +++ b/hudi-flink-datasource/hudi-flink/src/test/resources/log4j-surefire.properties @@ -19,9 +19,9 @@ log4j.rootLogger=INFO, CONSOLE log4j.logger.org.apache=INFO log4j.logger.org.apache.hudi=DEBUG log4j.logger.org.apache.hadoop.hbase=ERROR -# A1 is set to be a ConsoleAppender. +# CONSOLE is set to be a ConsoleAppender. log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. +# CONSOLE uses PatternLayout. 
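Stepping back to the date-based partitioning changes above (the SCALAR keygen setup in `HoodieTableFactory` and the new date-partition tests), a worked sketch of the value mapping may help. It assumes `PARTITION_FORMAT_DAY` is `yyyyMMdd` and `PARTITION_FORMAT_DASHED_DAY` is `yyyy-MM-dd`, which matches the partition paths the tests expect.

```java
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;

public class DatePartitionSketch {

  public static void main(String[] args) {
    // A Flink DATE value is carried as days since the epoch, so DATE '1970-01-02' arrives as 1.
    int epochDays = 1;
    LocalDate date = LocalDate.ofEpochDay(epochDays);

    // Assumed output formats, consistent with the test expectations above.
    String day = date.format(DateTimeFormatter.ofPattern("yyyyMMdd"));          // "19700102"
    String dashedDay = date.format(DateTimeFormatter.ofPattern("yyyy-MM-dd"));  // "1970-01-02"

    System.out.println(day + " / " + dashedDay);
  }
}
```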
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeCompactedRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeCompactedRecordReader.java index 9618f5f7caded..b917f004bcd06 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeCompactedRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeCompactedRecordReader.java @@ -18,6 +18,12 @@ package org.apache.hudi.hadoop.realtime; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.config.HoodieCommonConfig; import org.apache.hudi.common.fs.FSUtils; @@ -29,27 +35,15 @@ import org.apache.hudi.hadoop.config.HoodieRealtimeConfig; import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils; - -import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.ArrayWritable; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.RecordReader; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import java.io.IOException; import java.util.HashSet; import java.util.Iterator; -import java.util.List; import java.util.Map; import java.util.Set; -import static org.apache.hudi.common.fs.FSUtils.getRelativePartitionPath; -import static org.apache.hudi.common.util.CollectionUtils.isNullOrEmpty; - class RealtimeCompactedRecordReader extends AbstractRealtimeRecordReader implements RecordReader { @@ -83,11 +77,10 @@ private HoodieMergedLogRecordScanner getMergedLogRecordScanner() throws IOExcept // NOTE: HoodieCompactedLogRecordScanner will not return records for an in-flight commit // but can return records for completed commits > the commit we are trying to read (if using // readCommit() API) - List logPaths = split.getDeltaLogPaths(); - HoodieMergedLogRecordScanner.Builder logRecordScannerBuilder = HoodieMergedLogRecordScanner.newBuilder() + return HoodieMergedLogRecordScanner.newBuilder() .withFileSystem(FSUtils.getFs(split.getPath().toString(), jobConf)) .withBasePath(split.getBasePath()) - .withLogFilePaths(logPaths) + .withLogFilePaths(split.getDeltaLogPaths()) .withReaderSchema(usesCustomPayload ? 
getWriterSchema() : getReaderSchema()) .withLatestInstantTime(split.getMaxCommitTime()) .withMaxMemorySizeInBytes(HoodieRealtimeRecordReaderUtils.getMaxCompactionMemoryInBytes(jobConf)) @@ -97,12 +90,8 @@ private HoodieMergedLogRecordScanner getMergedLogRecordScanner() throws IOExcept .withSpillableMapBasePath(jobConf.get(HoodieRealtimeConfig.SPILLABLE_MAP_BASE_PATH_PROP, HoodieRealtimeConfig.DEFAULT_SPILLABLE_MAP_BASE_PATH)) .withDiskMapType(jobConf.getEnum(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.key(), HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue())) .withBitCaskDiskMapCompressionEnabled(jobConf.getBoolean(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), - HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue())); - if (!isNullOrEmpty(logPaths)) { - logRecordScannerBuilder - .withPartition(getRelativePartitionPath(new Path(split.getBasePath()), new Path(logPaths.get(0)).getParent())); - } - return logRecordScannerBuilder.build(); + HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue())) + .build(); } private Option buildGenericRecordwithCustomPayload(HoodieRecord record) throws IOException { diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieRealtimeRecordReaderUtils.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieRealtimeRecordReaderUtils.java index 0aa74ef154334..0e4f9c304cb2b 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieRealtimeRecordReaderUtils.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieRealtimeRecordReaderUtils.java @@ -18,13 +18,7 @@ package org.apache.hudi.hadoop.utils; -import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.hadoop.config.HoodieRealtimeConfig; -import org.apache.hudi.io.storage.HoodieFileReader; -import org.apache.hudi.io.storage.HoodieFileReaderFactory; +import org.apache.avro.JsonProperties; import org.apache.avro.LogicalTypes; import org.apache.avro.Schema; import org.apache.avro.generic.GenericArray; @@ -32,8 +26,8 @@ import org.apache.avro.generic.GenericRecord; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.serde2.io.DoubleWritable; import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.HiveDecimalUtils; @@ -46,6 +40,12 @@ import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapred.JobConf; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.hadoop.config.HoodieRealtimeConfig; +import org.apache.hudi.io.storage.HoodieFileReader; +import org.apache.hudi.io.storage.HoodieFileReaderFactory; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -60,6 +60,9 @@ import java.util.TreeMap; import java.util.stream.Collectors; +import static org.apache.hudi.avro.AvroSchemaUtils.appendFieldsToSchema; +import static org.apache.hudi.avro.AvroSchemaUtils.createNullableSchema; + public class HoodieRealtimeRecordReaderUtils { private static final Logger LOG = 
LogManager.getLogger(HoodieRealtimeRecordReaderUtils.class); @@ -287,6 +290,14 @@ public static Schema addPartitionFields(Schema schema, List partitioning List fieldsToAdd = partitioningFields.stream().map(String::toLowerCase) .filter(x -> !firstLevelFieldNames.contains(x)).collect(Collectors.toList()); - return HoodieAvroUtils.appendNullSchemaFields(schema, fieldsToAdd); + return appendNullSchemaFields(schema, fieldsToAdd); + } + + private static Schema appendNullSchemaFields(Schema schema, List newFieldNames) { + List newFields = new ArrayList<>(); + for (String newField : newFieldNames) { + newFields.add(new Schema.Field(newField, createNullableSchema(Schema.Type.STRING), "", JsonProperties.NULL_VALUE)); + } + return appendFieldsToSchema(schema, newFields); } } diff --git a/hudi-hadoop-mr/src/test/resources/log4j-surefire.properties b/hudi-hadoop-mr/src/test/resources/log4j-surefire.properties index c03e808cca1f8..fe87e2deeb071 100644 --- a/hudi-hadoop-mr/src/test/resources/log4j-surefire.properties +++ b/hudi-hadoop-mr/src/test/resources/log4j-surefire.properties @@ -19,9 +19,9 @@ log4j.rootLogger=WARN, CONSOLE log4j.logger.org.apache=INFO log4j.logger.org.apache.hudi=DEBUG -# A1 is set to be a ConsoleAppender. +# CONSOLE is set to be a ConsoleAppender. log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. +# CONSOLE uses PatternLayout. log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/configuration/DeltaConfig.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/configuration/DeltaConfig.java index d7280402d2d5d..581cce954a53c 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/configuration/DeltaConfig.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/configuration/DeltaConfig.java @@ -18,14 +18,15 @@ package org.apache.hudi.integ.testsuite.configuration; -import com.fasterxml.jackson.databind.ObjectMapper; -import org.apache.hadoop.conf.Configuration; import org.apache.hudi.common.config.SerializableConfiguration; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.integ.testsuite.reader.DeltaInputType; import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.hadoop.conf.Configuration; + import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; @@ -69,6 +70,7 @@ public static class Config { public static final String TYPE = "type"; public static final String NODE_NAME = "name"; public static final String DEPENDENCIES = "deps"; + public static final String NO_DEPENDENCY_VALUE = "none"; public static final String CHILDREN = "children"; public static final String HIVE_QUERIES = "hive_queries"; public static final String HIVE_PROPERTIES = "hive_props"; diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/DagUtils.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/DagUtils.java index 999bc43661894..789d7e3423466 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/DagUtils.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/DagUtils.java @@ -18,25 +18,28 @@ package org.apache.hudi.integ.testsuite.dag; 
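For clarity on the new `appendNullSchemaFields` helper above: each appended partition column becomes an optional string field with a null default. A standalone sketch with plain Avro follows, assuming `createNullableSchema(Schema.Type.STRING)` builds a null-first union; the column name is made up.

```java
import java.util.Arrays;

import org.apache.avro.JsonProperties;
import org.apache.avro.Schema;

public class NullablePartitionFieldSketch {

  public static Schema.Field nullableStringField(String name) {
    // Union of null and string, null first so the null default is valid.
    Schema nullableString = Schema.createUnion(
        Arrays.asList(Schema.create(Schema.Type.NULL), Schema.create(Schema.Type.STRING)));
    return new Schema.Field(name, nullableString, "", JsonProperties.NULL_VALUE);
  }

  public static void main(String[] args) {
    // "datestr" is a hypothetical partition column appended to the reader schema.
    System.out.println(nullableStringField("datestr"));
  }
}
```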
-import com.fasterxml.jackson.core.JsonGenerator; -import com.fasterxml.jackson.core.JsonToken; -import com.fasterxml.jackson.databind.DeserializationContext; -import com.fasterxml.jackson.databind.JsonDeserializer; -import com.fasterxml.jackson.databind.JsonSerializer; -import com.fasterxml.jackson.databind.SerializerProvider; -import com.fasterxml.jackson.databind.module.SimpleModule; import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.integ.testsuite.configuration.DeltaConfig; import org.apache.hudi.integ.testsuite.dag.nodes.DagNode; +import com.fasterxml.jackson.core.JsonGenerator; import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.core.JsonToken; +import com.fasterxml.jackson.databind.DeserializationContext; +import com.fasterxml.jackson.databind.JsonDeserializer; import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.JsonSerializer; import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.SerializerProvider; +import com.fasterxml.jackson.databind.module.SimpleModule; import com.fasterxml.jackson.databind.node.ObjectNode; import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; import com.fasterxml.jackson.dataformat.yaml.YAMLGenerator.Feature; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; @@ -48,8 +51,9 @@ import java.util.Map; import java.util.Map.Entry; import java.util.stream.Collectors; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; + +import static org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config.CONFIG_NAME; +import static org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config.NO_DEPENDENCY_VALUE; /** * Utility class to SerDe workflow dag. 
@@ -121,7 +125,12 @@ public static String convertDagToYaml(WorkflowDag dag) throws IOException { final ObjectMapper yamlWriter = new ObjectMapper(new YAMLFactory().disable(Feature.WRITE_DOC_START_MARKER) .enable(Feature.MINIMIZE_QUOTES).enable(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES)); JsonNode yamlNode = MAPPER.createObjectNode(); - convertDagToYaml(yamlNode, dag.getNodeList()); + ((ObjectNode) yamlNode).put(DAG_NAME, dag.getDagName()); + ((ObjectNode) yamlNode).put(DAG_ROUNDS, dag.getRounds()); + ((ObjectNode) yamlNode).put(DAG_INTERMITTENT_DELAY_MINS, dag.getIntermittentDelayMins()); + JsonNode dagContentNode = MAPPER.createObjectNode(); + convertDagToYaml(dagContentNode, dag.getNodeList()); + ((ObjectNode) yamlNode).put(DAG_CONTENT, dagContentNode); return yamlWriter.writerWithDefaultPrettyPrinter().writeValueAsString(yamlNode); } @@ -179,7 +188,7 @@ private static JsonNode convertDagNodeToJsonNode(DagNode node) throws IOExceptio private static Map convertJsonNodeToMap(JsonNode node) { Map configsMap = new HashMap<>(); - Iterator> itr = node.get(DeltaConfig.Config.CONFIG_NAME).fields(); + Iterator> itr = node.get(CONFIG_NAME).fields(); while (itr.hasNext()) { Entry entry = itr.next(); switch (entry.getKey()) { @@ -257,9 +266,14 @@ private static JsonNode createJsonNode(DagNode node, String type) throws IOExcep break; } } - ((ObjectNode) jsonNode).put(DeltaConfig.Config.CONFIG_NAME, configNode); + ((ObjectNode) jsonNode).put(CONFIG_NAME, configNode); ((ObjectNode) jsonNode).put(DeltaConfig.Config.TYPE, type); - ((ObjectNode) jsonNode).put(DeltaConfig.Config.DEPENDENCIES, getDependencyNames(node)); + String dependencyNames = getDependencyNames(node); + if (StringUtils.isNullOrEmpty(dependencyNames) || "\"\"".equals(dependencyNames)) { + // Set "none" if there is no dependency + dependencyNames = NO_DEPENDENCY_VALUE; + } + ((ObjectNode) jsonNode).put(DeltaConfig.Config.DEPENDENCIES, dependencyNames); return jsonNode; } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BaseValidateDatasetNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BaseValidateDatasetNode.java index de58bf6a1f205..b5c661cb085f6 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BaseValidateDatasetNode.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BaseValidateDatasetNode.java @@ -81,7 +81,7 @@ public void execute(ExecutionContext context, int curItrCount) throws Exception SparkSession session = SparkSession.builder().sparkContext(context.getJsc().sc()).getOrCreate(); // todo: Fix partitioning schemes. For now, assumes data based partitioning. String inputPath = context.getHoodieTestSuiteWriter().getCfg().inputBasePath + "/*/*"; - log.warn("Validation using data from input path " + inputPath); + log.info("Validation using data from input path " + inputPath); // listing batches to be validated String inputPathStr = context.getHoodieTestSuiteWriter().getCfg().inputBasePath; if (log.isDebugEnabled()) { @@ -166,7 +166,7 @@ private Dataset getInputDf(ExecutionContext context, SparkSession session, ExpressionEncoder encoder = getEncoder(inputDf.schema()); return inputDf.groupByKey( (MapFunction) value -> - value.getAs(partitionPathField) + "+" + value.getAs(recordKeyField), Encoders.STRING()) + (partitionPathField.isEmpty() ? 
value.getAs(recordKeyField) : (value.getAs(partitionPathField) + "+" + value.getAs(recordKeyField))), Encoders.STRING()) .reduceGroups((ReduceFunction) (v1, v2) -> { int ts1 = v1.getAs(SchemaUtils.SOURCE_ORDERING_FIELD); int ts2 = v2.getAs(SchemaUtils.SOURCE_ORDERING_FIELD); diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateDatasetNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateDatasetNode.java index cc293ea470164..358abb36f9cdc 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateDatasetNode.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateDatasetNode.java @@ -20,6 +20,7 @@ import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.DataSourceWriteOptions; import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; import org.apache.hudi.integ.testsuite.dag.ExecutionContext; import org.apache.spark.sql.Dataset; @@ -48,8 +49,8 @@ public Logger getLogger() { @Override public Dataset getDatasetToValidate(SparkSession session, ExecutionContext context, StructType inputSchema) { - String hudiPath = context.getHoodieTestSuiteWriter().getCfg().targetBasePath + "/*/*/*"; - log.info("Validate data in target hudi path " + hudiPath); + String partitionPathField = context.getWriterContext().getProps().getString(DataSourceWriteOptions.PARTITIONPATH_FIELD().key()); + String hudiPath = context.getHoodieTestSuiteWriter().getCfg().targetBasePath + (partitionPathField.isEmpty() ? "/" : "/*/*/*"); Dataset hudiDf = session.read().option(HoodieMetadataConfig.ENABLE.key(), String.valueOf(config.isEnableMetadataValidate())) .format("hudi").load(hudiPath); return hudiDf.drop(HoodieRecord.COMMIT_TIME_METADATA_FIELD).drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD).drop(HoodieRecord.RECORD_KEY_METADATA_FIELD) diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/SparkDataSourceContinuousIngest.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/SparkDataSourceContinuousIngest.scala index 9ead7f290a06e..6352106326930 100644 --- a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/SparkDataSourceContinuousIngest.scala +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/SparkDataSourceContinuousIngest.scala @@ -64,16 +64,20 @@ class SparkDataSourceContinuousIngest(val spark: SparkSession, val conf: Configu } } - orderedBatch.foreach(entry => { - log.info("Consuming from batch " + entry) - val pathToConsume = new Path(sourcePath.toString + "/" + entry.getPath.getName) - val df = spark.read.format(sourceFormat).load(pathToConsume.toString) + if (orderedBatch.isEmpty) { + log.info("All batches have been consumed. Exiting.") + } else { + orderedBatch.foreach(entry => { + log.info("Consuming from batch " + entry) + val pathToConsume = new Path(sourcePath.toString + "/" + entry.getPath.getName) + val df = spark.read.format(sourceFormat).load(pathToConsume.toString) - df.write.format("hudi").options(hudiOptions).mode(SaveMode.Append).save(hudiBasePath.toString) - writeToFile(checkpointFile, entry.getPath.getName, checkPointFs) - log.info("Completed batch " + entry + ". Moving to next batch. 
Sleeping for " + minSyncIntervalSeconds + " secs before next batch") - Thread.sleep(minSyncIntervalSeconds * 1000) - }) + df.write.format("hudi").options(hudiOptions).mode(SaveMode.Append).save(hudiBasePath.toString) + writeToFile(checkpointFile, entry.getPath.getName, checkPointFs) + log.info("Completed batch " + entry + ". Moving to next batch. Sleeping for " + minSyncIntervalSeconds + " secs before next batch") + Thread.sleep(minSyncIntervalSeconds * 1000) + }) + } } def fetchListOfFilesToConsume(fs: FileSystem, basePath: Path, pathFilter: PathFilter): Array[FileStatus] = { diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/TestDFSHoodieTestSuiteWriterAdapter.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/TestDFSHoodieTestSuiteWriterAdapter.java index 4a148da7954ac..e6eb036e9b9f5 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/TestDFSHoodieTestSuiteWriterAdapter.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/TestDFSHoodieTestSuiteWriterAdapter.java @@ -18,16 +18,6 @@ package org.apache.hudi.integ.testsuite; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.mockito.Mockito.times; -import static org.mockito.Mockito.when; - -import java.io.IOException; -import java.util.Iterator; -import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.hudi.common.config.SerializableConfiguration; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig; @@ -44,14 +34,27 @@ import org.apache.hudi.integ.testsuite.writer.DeltaWriterFactory; import org.apache.hudi.utilities.schema.FilebasedSchemaProvider; import org.apache.hudi.utilities.testutils.UtilitiesTestBase; + +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.spark.api.java.JavaRDD; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.mockito.Mockito; +import java.io.IOException; +import java.util.Iterator; + +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.when; + /** * Unit test against DeltaWriterAdapter, by testing writing DFS files. 
*/ @@ -102,6 +105,8 @@ public void testDFSOneFileWrite() throws IOException { } @Test + @Disabled + // TODO(HUDI-3668): Fix this test public void testDFSTwoFilesWriteWithRollover() throws IOException { DeltaInputWriter mockFileSinkWriter = Mockito.mock(AvroFileDeltaInputWriter.class); @@ -122,6 +127,8 @@ public void testDFSTwoFilesWriteWithRollover() throws IOException { } @Test + @Disabled + // TODO(HUDI-3668): Fix this test public void testDFSWorkloadSinkWithMultipleFilesFunctional() throws IOException { DeltaConfig dfsSinkConfig = new DFSDeltaConfig(DeltaOutputMode.DFS, DeltaInputType.AVRO, new SerializableConfiguration(jsc.hadoopConfiguration()), dfsBasePath, dfsBasePath, diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/converter/TestDeleteConverter.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/converter/TestDeleteConverter.java index adaa7e9d446be..86e117197669a 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/converter/TestDeleteConverter.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/converter/TestDeleteConverter.java @@ -76,8 +76,7 @@ public void testGenerateDeleteRecordsFromInputRecords() throws Exception { .collectAsMap(); List deleteRecords = outputRDD.collect(); deleteRecords.stream().forEach(updateRecord -> { - GenericRecord inputRecord = inputRecords.get(updateRecord.get("_row_key").toString()); - assertTrue((boolean)inputRecord.get(DEFAULT_HOODIE_IS_DELETED_COL)); + assertTrue((boolean) updateRecord.get(DEFAULT_HOODIE_IS_DELETED_COL)); }); } } diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/dag/TestDagUtils.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/dag/TestDagUtils.java index 5267e8876fe83..d5f2af2094723 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/dag/TestDagUtils.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/dag/TestDagUtils.java @@ -18,17 +18,20 @@ package org.apache.hudi.integ.testsuite.dag; -import static org.junit.jupiter.api.Assertions.assertEquals; - -import java.util.ArrayList; -import java.util.List; import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; import org.apache.hudi.integ.testsuite.dag.nodes.DagNode; import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode; import org.apache.hudi.utilities.testutils.UtilitiesTestBase; + import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import java.util.ArrayList; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; + /** * A utility class for DAG test. 
*/ @@ -44,6 +47,8 @@ public void testConvertDagToYaml() throws Exception { } @Test + @Disabled + // TODO(HUDI-3668): Fix this test public void testConvertDagToYamlHiveQuery() throws Exception { WorkflowDag dag = new HiveSyncDagGenerator().build(); DagNode insert1 = (DagNode) dag.getNodeList().get(0); diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/generator/TestGenericRecordPayloadEstimator.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/generator/TestGenericRecordPayloadEstimator.java index c31a7d67395a9..1959620aeb355 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/generator/TestGenericRecordPayloadEstimator.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/generator/TestGenericRecordPayloadEstimator.java @@ -18,13 +18,14 @@ package org.apache.hudi.integ.testsuite.generator; -import static junit.framework.TestCase.assertEquals; - -import org.apache.avro.Schema; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.utilities.testutils.UtilitiesTestBase; + +import org.apache.avro.Schema; import org.junit.jupiter.api.Test; +import static junit.framework.TestCase.assertEquals; + /** * Unit test for {@link GenericRecordFullPayloadSizeEstimator}. */ @@ -41,8 +42,8 @@ public void testSimpleSchemaSize() throws Exception { GenericRecordFullPayloadSizeEstimator estimator = new GenericRecordFullPayloadSizeEstimator(schema); Pair estimateAndNumComplexFields = estimator.typeEstimateAndNumComplexFields(); - assertEquals(estimateAndNumComplexFields.getRight().intValue(), 0); - assertEquals(estimateAndNumComplexFields.getLeft().intValue(), 156); + assertEquals(0, estimateAndNumComplexFields.getRight().intValue()); + assertEquals(157, estimateAndNumComplexFields.getLeft().intValue()); } @Test diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/job/TestHoodieTestSuiteJob.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/job/TestHoodieTestSuiteJob.java index 62c1d851e2108..7fae555068b21 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/job/TestHoodieTestSuiteJob.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/job/TestHoodieTestSuiteJob.java @@ -47,6 +47,7 @@ import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; @@ -59,6 +60,8 @@ /** * Unit test against {@link HoodieTestSuiteJob}. 
*/ +@Disabled +// TODO(HUDI-3668): Fix this test public class TestHoodieTestSuiteJob extends UtilitiesTestBase { private static final String TEST_NAME_WITH_PARAMS = "[{index}] Test with useDeltaStreamer={0}, tableType={1}"; diff --git a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/reader/TestDFSHoodieDatasetInputReader.java b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/reader/TestDFSHoodieDatasetInputReader.java index 1caf8f80f51ac..a5b6072029f06 100644 --- a/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/reader/TestDFSHoodieDatasetInputReader.java +++ b/hudi-integ-test/src/test/java/org/apache/hudi/integ/testsuite/reader/TestDFSHoodieDatasetInputReader.java @@ -18,13 +18,6 @@ package org.apache.hudi.integ.testsuite.reader; -import static junit.framework.TestCase.assertEquals; -import static junit.framework.TestCase.assertTrue; - -import java.util.HashSet; -import java.util.List; -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; @@ -34,13 +27,23 @@ import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.utilities.testutils.UtilitiesTestBase; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; import org.apache.spark.api.java.JavaRDD; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import java.util.HashSet; +import java.util.List; + +import static junit.framework.TestCase.assertEquals; +import static junit.framework.TestCase.assertTrue; + /** * Unit test for {@link DFSHoodieDatasetInputReader}. */ @@ -68,6 +71,8 @@ public void teardown() throws Exception { } @Test + @Disabled + // TODO(HUDI-3668): Fix this test public void testSimpleHoodieDatasetReader() throws Exception { HoodieWriteConfig config = makeHoodieClientConfig(); diff --git a/hudi-integ-test/src/test/resources/log4j-surefire.properties b/hudi-integ-test/src/test/resources/log4j-surefire.properties index c03e808cca1f8..fe87e2deeb071 100644 --- a/hudi-integ-test/src/test/resources/log4j-surefire.properties +++ b/hudi-integ-test/src/test/resources/log4j-surefire.properties @@ -19,9 +19,9 @@ log4j.rootLogger=WARN, CONSOLE log4j.logger.org.apache=INFO log4j.logger.org.apache.hudi=DEBUG -# A1 is set to be a ConsoleAppender. +# CONSOLE is set to be a ConsoleAppender. log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. +# CONSOLE uses PatternLayout. log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter diff --git a/hudi-kafka-connect/README.md b/hudi-kafka-connect/README.md index a5784139bcdd2..f80e2c9fe1627 100644 --- a/hudi-kafka-connect/README.md +++ b/hudi-kafka-connect/README.md @@ -48,8 +48,8 @@ $CONFLUENT_DIR/bin/confluent-hub install confluentinc/kafka-connect-hdfs:10.1.0 cp -r $CONFLUENT_DIR/share/confluent-hub-components/confluentinc-kafka-connect-hdfs/* /usr/local/share/kafka/plugins/ ``` -Now, build the packaged jar that contains all the hudi classes, including the Hudi Kafka Connector. 
And copy it -to the plugin path that contains all the other jars (`/usr/local/share/kafka/plugins/lib`) +Now, build the packaged jar that contains all the hudi classes, including the Hudi Kafka Connector. And copy it to the +plugin path that contains all the other jars (`/usr/local/share/kafka/plugins/lib`) ```bash cd $HUDI_DIR @@ -58,8 +58,20 @@ mkdir -p /usr/local/share/kafka/plugins/lib cp $HUDI_DIR/packaging/hudi-kafka-connect-bundle/target/hudi-kafka-connect-bundle-0.11.0-SNAPSHOT.jar /usr/local/share/kafka/plugins/lib ``` -Set up a Kafka broker locally. Download the latest apache kafka from [here](https://kafka.apache.org/downloads). -Once downloaded and built, run the Zookeeper server and Kafka server using the command line tools. +If the Hudi Sink Connector writes to a target Hudi table on [Amazon S3](https://aws.amazon.com/s3/), you need two +additional jars, `hadoop-aws-2.10.1.jar` and `aws-java-sdk-bundle-1.11.271.jar`, in the `plugins/lib` folder. You may +download them using the following commands. Note that, when you specify the target table path on S3, you need to use +`s3a://` prefix. + +```bash +cd /usr/local/share/kafka/plugins/lib +wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.271/aws-java-sdk-bundle-1.11.271.jar +wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.10.1/hadoop-aws-2.10.1.jar +``` + +Set up a Kafka broker locally. Download the latest apache kafka from [here](https://kafka.apache.org/downloads). Once +downloaded and built, run the Zookeeper server and Kafka server using the command line tools. + ```bash export KAFKA_HOME=/path/to/kafka_install_dir cd $KAFKA_HOME @@ -67,6 +79,7 @@ cd $KAFKA_HOME ./bin/zookeeper-server-start.sh ./config/zookeeper.properties ./bin/kafka-server-start.sh ./config/server.properties ``` + Wait until the kafka cluster is up and running. ### 2 - Set up the schema registry diff --git a/hudi-kafka-connect/src/test/resources/log4j-surefire.properties b/hudi-kafka-connect/src/test/resources/log4j-surefire.properties index 32af462093ae5..14bbb089724c8 100644 --- a/hudi-kafka-connect/src/test/resources/log4j-surefire.properties +++ b/hudi-kafka-connect/src/test/resources/log4j-surefire.properties @@ -20,9 +20,9 @@ log4j.logger.org.apache=INFO log4j.logger.org.apache.hudi=DEBUG log4j.logger.org.apache.hadoop.hbase=ERROR -# A1 is set to be a ConsoleAppender. +# CONSOLE is set to be a ConsoleAppender. log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. +# CONSOLE uses PatternLayout. 
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/internal/DataSourceInternalWriterHelper.java b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/internal/DataSourceInternalWriterHelper.java index 7ca62d1520957..5aa82642de62e 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/internal/DataSourceInternalWriterHelper.java +++ b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/internal/DataSourceInternalWriterHelper.java @@ -66,7 +66,7 @@ public DataSourceInternalWriterHelper(String instantTime, HoodieWriteConfig writ writeClient.startCommitWithTime(instantTime); this.metaClient = HoodieTableMetaClient.builder().setConf(configuration).setBasePath(writeConfig.getBasePath()).build(); - this.metaClient.validateTableProperties(writeConfig.getProps(), WriteOperationType.BULK_INSERT); + this.metaClient.validateTableProperties(writeConfig.getProps()); this.hoodieTable = HoodieSparkTable.create(writeConfig, new HoodieSparkEngineContext(new JavaSparkContext(sparkSession.sparkContext())), metaClient); } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/hudi-spark-datasource/hudi-spark-common/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister index cc8fb0492affd..556b0feef1cdb 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister +++ b/hudi-spark-datasource/hudi-spark-common/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister @@ -17,4 +17,4 @@ org.apache.hudi.DefaultSource -org.apache.spark.sql.execution.datasources.parquet.SparkHoodieParquetFileFormat \ No newline at end of file +org.apache.spark.sql.execution.datasources.parquet.HoodieParquetFileFormat \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/BaseFileOnlyRelation.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/BaseFileOnlyRelation.scala index 525292da6da98..c57f46a7b6639 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/BaseFileOnlyRelation.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/BaseFileOnlyRelation.scala @@ -20,13 +20,14 @@ package org.apache.hudi import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path - -import org.apache.hudi.HoodieBaseRelation.createBaseFileReader +import org.apache.hudi.common.model.HoodieFileFormat import org.apache.hudi.common.table.HoodieTableMetaClient - +import org.apache.hudi.hadoop.HoodieROTablePathFilter import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.execution.datasources.parquet.HoodieParquetFileFormat +import org.apache.spark.sql.hive.orc.OrcFileFormat import org.apache.spark.sql.sources.{BaseRelation, Filter} import org.apache.spark.sql.types.StructType @@ -53,7 +54,8 @@ class BaseFileOnlyRelation(sqlContext: SQLContext, override type FileSplit = HoodieBaseFileSplit - override lazy val mandatoryColumns: Seq[String] = + override lazy val 
mandatoryFields: Seq[String] = + // TODO reconcile, record's key shouldn't be mandatory for base-file only relation Seq(recordKeyField) override def imbueConfigs(sqlContext: SQLContext): Unit = { @@ -63,14 +65,14 @@ class BaseFileOnlyRelation(sqlContext: SQLContext, protected override def composeRDD(fileSplits: Seq[HoodieBaseFileSplit], partitionSchema: StructType, - tableSchema: HoodieTableSchema, + dataSchema: HoodieTableSchema, requiredSchema: HoodieTableSchema, filters: Array[Filter]): HoodieUnsafeRDD = { val baseFileReader = createBaseFileReader( spark = sparkSession, partitionSchema = partitionSchema, - tableSchema = tableSchema, + dataSchema = dataSchema, requiredSchema = requiredSchema, filters = filters, options = optParams, @@ -84,21 +86,90 @@ class BaseFileOnlyRelation(sqlContext: SQLContext, protected def collectFileSplits(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[HoodieBaseFileSplit] = { val partitions = listLatestBaseFiles(globPaths, partitionFilters, dataFilters) - val fileSplits = partitions.values.toSeq.flatMap { files => - files.flatMap { file => - // TODO move to adapter - // TODO fix, currently assuming parquet as underlying format - HoodieDataSourceHelper.splitFiles( - sparkSession = sparkSession, - file = file, - // TODO clarify why this is required - partitionValues = getPartitionColumnsAsInternalRow(file) - ) + val fileSplits = partitions.values.toSeq + .flatMap { files => + files.flatMap { file => + // TODO fix, currently assuming parquet as underlying format + HoodieDataSourceHelper.splitFiles( + sparkSession = sparkSession, + file = file, + partitionValues = getPartitionColumnsAsInternalRow(file) + ) + } } - } + // NOTE: It's important to order the splits in the reverse order of their + // size so that we can subsequently bucket them in an efficient manner + .sortBy(_.length)(implicitly[Ordering[Long]].reverse) val maxSplitBytes = sparkSession.sessionState.conf.filesMaxPartitionBytes - sparkAdapter.getFilePartitions(sparkSession, fileSplits, maxSplitBytes).map(HoodieBaseFileSplit.apply) + sparkAdapter.getFilePartitions(sparkSession, fileSplits, maxSplitBytes) + .map(HoodieBaseFileSplit.apply) + } + + /** + * NOTE: We have to fallback to [[HadoopFsRelation]] to make sure that all of the Spark optimizations could be + * equally applied to Hudi tables, since some of those are predicated on the usage of [[HadoopFsRelation]], + * and won't be applicable in case of us using our own custom relations (one of such optimizations is [[SchemaPruning]] + * rule; you can find more details in HUDI-3896) + */ + def toHadoopFsRelation: HadoopFsRelation = { + val (tableFileFormat, formatClassName) = + metaClient.getTableConfig.getBaseFileFormat match { + case HoodieFileFormat.ORC => (new OrcFileFormat, "orc") + case HoodieFileFormat.PARQUET => + // We're delegating to Spark to append partition values to every row only in cases + // when these corresponding partition-values are not persisted w/in the data file itself + val parquetFileFormat = sparkAdapter.createHoodieParquetFileFormat(shouldExtractPartitionValuesFromPartitionPath).get + (parquetFileFormat, HoodieParquetFileFormat.FILE_FORMAT_ID) + } + + if (globPaths.isEmpty) { + // NOTE: There are currently 2 ways partition values could be fetched: + // - Source columns (producing the values used for physical partitioning) will be read + // from the data file + // - Values parsed from the actual partition path would be appended to the final dataset + // + // In the former case, we don't need to provide 
the partition-schema to the relation, + // therefore we simply stub it w/ empty schema and use full table-schema as the one being + // read from the data file. + // + // In the latter, we have to specify proper partition schema as well as "data"-schema, essentially + // being a table-schema with all partition columns stripped out + val (partitionSchema, dataSchema) = if (shouldExtractPartitionValuesFromPartitionPath) { + (fileIndex.partitionSchema, fileIndex.dataSchema) + } else { + (StructType(Nil), tableStructSchema) + } + + HadoopFsRelation( + location = fileIndex, + partitionSchema = partitionSchema, + dataSchema = dataSchema, + bucketSpec = None, + fileFormat = tableFileFormat, + optParams)(sparkSession) + } else { + val readPathsStr = optParams.get(DataSourceReadOptions.READ_PATHS.key) + val extraReadPaths = readPathsStr.map(p => p.split(",").toSeq).getOrElse(Seq()) + + DataSource.apply( + sparkSession = sparkSession, + paths = extraReadPaths, + userSpecifiedSchema = userSchema, + className = formatClassName, + // Since we're reading the table as just collection of files we have to make sure + // we only read the latest version of every Hudi's file-group, which might be compacted, clustered, etc. + // while keeping previous versions of the files around as well. + // + // We rely on [[HoodieROTablePathFilter]], to do proper filtering to assure that + options = optParams ++ Map( + "mapreduce.input.pathFilter.class" -> classOf[HoodieROTablePathFilter].getName + ), + partitionColumns = partitionColumns + ) + .resolveRelation() + .asInstanceOf[HadoopFsRelation] + } } } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/ColumnStatsIndexSupport.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/ColumnStatsIndexSupport.scala index 74393234120e5..b1e03f86ff807 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/ColumnStatsIndexSupport.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/ColumnStatsIndexSupport.scala @@ -26,6 +26,7 @@ import org.apache.hudi.client.common.HoodieSparkEngineContext import org.apache.hudi.common.config.HoodieMetadataConfig import org.apache.hudi.common.model.HoodieRecord import org.apache.hudi.common.table.view.FileSystemViewStorageConfig +import org.apache.hudi.common.util.ValidationUtils.checkState import org.apache.hudi.common.util.hash.ColumnIndexID import org.apache.hudi.data.HoodieJavaRDD import org.apache.hudi.metadata.{HoodieMetadataPayload, HoodieTableMetadata, HoodieTableMetadataUtil, MetadataPartitionType} @@ -68,7 +69,7 @@ trait ColumnStatsIndexSupport extends SparkAdapterSupport { if (targetColumns.nonEmpty) { readColumnStatsIndexForColumnsInternal(spark, targetColumns, metadataConfig, tableBasePath) } else { - readFullColumnStatsIndexInternal(spark, tableBasePath) + readFullColumnStatsIndexInternal(spark, metadataConfig, tableBasePath) } } @@ -113,11 +114,11 @@ trait ColumnStatsIndexSupport extends SparkAdapterSupport { * * @param spark Spark session ref * @param colStatsDF [[DataFrame]] bearing raw Column Stats Index table - * @param targetColumns target columns to be included into the final table + * @param queryColumns target columns to be included into the final table * @param tableSchema schema of the source data table * @return reshaped table according to the format outlined above */ - def transposeColumnStatsIndex(spark: SparkSession, colStatsDF: DataFrame, targetColumns: Seq[String], tableSchema: StructType): DataFrame = 
{ + def transposeColumnStatsIndex(spark: SparkSession, colStatsDF: DataFrame, queryColumns: Seq[String], tableSchema: StructType): DataFrame = { val colStatsSchema = colStatsDF.schema val colStatsSchemaOrdinalsMap = colStatsSchema.fields.zipWithIndex.map({ case (field, ordinal) => (field.name, ordinal) @@ -125,10 +126,6 @@ trait ColumnStatsIndexSupport extends SparkAdapterSupport { val tableSchemaFieldMap = tableSchema.fields.map(f => (f.name, f)).toMap - // NOTE: We're sorting the columns to make sure final index schema matches layout - // of the transposed table - val sortedColumns = TreeSet(targetColumns: _*) - val colNameOrdinal = colStatsSchemaOrdinalsMap(HoodieMetadataPayload.COLUMN_STATS_FIELD_COLUMN_NAME) val minValueOrdinal = colStatsSchemaOrdinalsMap(HoodieMetadataPayload.COLUMN_STATS_FIELD_MIN_VALUE) val maxValueOrdinal = colStatsSchemaOrdinalsMap(HoodieMetadataPayload.COLUMN_STATS_FIELD_MAX_VALUE) @@ -136,36 +133,69 @@ trait ColumnStatsIndexSupport extends SparkAdapterSupport { val nullCountOrdinal = colStatsSchemaOrdinalsMap(HoodieMetadataPayload.COLUMN_STATS_FIELD_NULL_COUNT) val valueCountOrdinal = colStatsSchemaOrdinalsMap(HoodieMetadataPayload.COLUMN_STATS_FIELD_VALUE_COUNT) - val transposedRDD = colStatsDF.rdd - .filter(row => sortedColumns.contains(row.getString(colNameOrdinal))) - .map { row => - val (minValue, _) = tryUnpackNonNullVal(row.getAs[Row](minValueOrdinal)) - val (maxValue, _) = tryUnpackNonNullVal(row.getAs[Row](maxValueOrdinal)) - - val colName = row.getString(colNameOrdinal) - val colType = tableSchemaFieldMap(colName).dataType + // NOTE: We have to collect list of indexed columns to make sure we properly align the rows + // w/in the transposed dataset: since some files might not have all of the columns indexed + // either due to the Column Stats Index config changes, schema evolution, etc, we have + // to make sure that all of the rows w/in transposed data-frame are properly padded (with null + // values) for such file-column combinations + val indexedColumns: Seq[String] = colStatsDF.rdd.map(row => row.getString(colNameOrdinal)).distinct().collect() - val rowValsSeq = row.toSeq.toArray - - rowValsSeq(minValueOrdinal) = deserialize(minValue, colType) - rowValsSeq(maxValueOrdinal) = deserialize(maxValue, colType) + // NOTE: We're sorting the columns to make sure final index schema matches layout + // of the transposed table + val sortedTargetColumns = TreeSet(queryColumns.intersect(indexedColumns): _*) - Row(rowValsSeq:_*) + val transposedRDD = colStatsDF.rdd + .filter(row => sortedTargetColumns.contains(row.getString(colNameOrdinal))) + .map { row => + if (row.isNullAt(minValueOrdinal) && row.isNullAt(maxValueOrdinal)) { + // Corresponding row could be null in either of the 2 cases + // - Column contains only null values (in that case both min/max have to be nulls) + // - This is a stubbed Column Stats record (used as a tombstone) + row + } else { + val minValueStruct = row.getAs[Row](minValueOrdinal) + val maxValueStruct = row.getAs[Row](maxValueOrdinal) + + checkState(minValueStruct != null && maxValueStruct != null, "Invalid Column Stats record: either both min/max have to be null, or both have to be non-null") + + val colName = row.getString(colNameOrdinal) + val colType = tableSchemaFieldMap(colName).dataType + + val (minValue, _) = tryUnpackNonNullVal(minValueStruct) + val (maxValue, _) = tryUnpackNonNullVal(maxValueStruct) + val rowValsSeq = row.toSeq.toArray + // Update min-/max-value structs w/ unwrapped values in-place + 
rowValsSeq(minValueOrdinal) = deserialize(minValue, colType) + rowValsSeq(maxValueOrdinal) = deserialize(maxValue, colType) + + Row(rowValsSeq: _*) + } } .groupBy(r => r.getString(fileNameOrdinal)) .foldByKey(Seq[Row]()) { - case (_, columnRows) => + case (_, columnRowsSeq) => // Rows seq is always non-empty (otherwise it won't be grouped into) - val fileName = columnRows.head.get(fileNameOrdinal) - val valueCount = columnRows.head.get(valueCountOrdinal) - - val coalescedRowValuesSeq = columnRows.toSeq - // NOTE: It's crucial to maintain appropriate ordering of the columns - // matching table layout - .sortBy(_.getString(colNameOrdinal)) - .foldLeft(Seq[Any](fileName, valueCount)) { - case (acc, columnRow) => - acc ++ Seq(minValueOrdinal, maxValueOrdinal, nullCountOrdinal).map(ord => columnRow.get(ord)) + val fileName = columnRowsSeq.head.get(fileNameOrdinal) + val valueCount = columnRowsSeq.head.get(valueCountOrdinal) + + // To properly align individual rows (corresponding to a file) w/in the transposed projection, we need + // to align existing column-stats for individual file with the list of expected ones for the + // whole transposed projection (a superset of all files) + val columnRowsMap = columnRowsSeq.map(row => (row.getString(colNameOrdinal), row)).toMap + val alignedColumnRowsSeq = sortedTargetColumns.toSeq.map(columnRowsMap.get) + + val coalescedRowValuesSeq = + alignedColumnRowsSeq.foldLeft(Seq[Any](fileName, valueCount)) { + case (acc, opt) => + opt match { + case Some(columnStatsRow) => + acc ++ Seq(minValueOrdinal, maxValueOrdinal, nullCountOrdinal).map(ord => columnStatsRow.get(ord)) + case None => + // NOTE: Since we're assuming missing column to essentially contain exclusively + // null values, we set null-count to be equal to value-count (this behavior is + // consistent with reading non-existent columns from Parquet) + acc ++ Seq(null, null, valueCount) + } } Seq(Row(coalescedRowValuesSeq:_*)) @@ -176,15 +206,16 @@ trait ColumnStatsIndexSupport extends SparkAdapterSupport { // NOTE: It's crucial to maintain appropriate ordering of the columns // matching table layout: hence, we cherry-pick individual columns // instead of simply filtering in the ones we're interested in the schema - val indexSchema = composeIndexSchema(sortedColumns.toSeq, tableSchema) + val indexSchema = composeIndexSchema(sortedTargetColumns.toSeq, tableSchema) spark.createDataFrame(transposedRDD, indexSchema) } - private def readFullColumnStatsIndexInternal(spark: SparkSession, tableBasePath: String) = { + private def readFullColumnStatsIndexInternal(spark: SparkSession, metadataConfig: HoodieMetadataConfig, tableBasePath: String): DataFrame = { val metadataTablePath = HoodieTableMetadata.getMetadataTableBasePath(tableBasePath) // Read Metadata Table's Column Stats Index into Spark's [[DataFrame]] spark.read.format("org.apache.hudi") + .options(metadataConfig.getProps.asScala) .load(s"$metadataTablePath/${MetadataPartitionType.COLUMN_STATS.getPartitionPath}") } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala index 432988962dea1..0d4c7cf184ddc 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala @@ -18,14 +18,16 @@ package org.apache.hudi import 
org.apache.hudi.DataSourceReadOptions.{QUERY_TYPE, QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, QUERY_TYPE_SNAPSHOT_OPT_VAL} +import org.apache.hudi.HoodieConversionUtils.toScalaOption import org.apache.hudi.common.config.{ConfigProperty, HoodieConfig} import org.apache.hudi.common.fs.ConsistencyGuardConfig import org.apache.hudi.common.model.{HoodieTableType, WriteOperationType} import org.apache.hudi.common.table.HoodieTableConfig import org.apache.hudi.common.util.Option +import org.apache.hudi.common.util.ValidationUtils.checkState import org.apache.hudi.config.{HoodieClusteringConfig, HoodieWriteConfig} import org.apache.hudi.hive.util.ConfigUtils -import org.apache.hudi.hive.{HiveSyncConfig, HiveSyncTool, MultiPartKeysValueExtractor, NonPartitionedExtractor, SlashEncodedDayPartitionValueExtractor} +import org.apache.hudi.hive.{HiveSyncConfig, HiveSyncTool} import org.apache.hudi.keygen.constant.KeyGeneratorOptions import org.apache.hudi.keygen.{ComplexKeyGenerator, CustomKeyGenerator, NonpartitionedKeyGenerator, SimpleKeyGenerator} import org.apache.hudi.sync.common.HoodieSyncConfig @@ -45,6 +47,7 @@ import scala.language.implicitConversions * Options supported for reading hoodie tables. */ object DataSourceReadOptions { + import DataSourceOptionsHelper._ val QUERY_TYPE_SNAPSHOT_OPT_VAL = "snapshot" val QUERY_TYPE_READ_OPTIMIZED_OPT_VAL = "read_optimized" @@ -119,11 +122,20 @@ object DataSourceReadOptions { val ENABLE_DATA_SKIPPING: ConfigProperty[Boolean] = ConfigProperty .key("hoodie.enable.data.skipping") - .defaultValue(true) + .defaultValue(false) .sinceVersion("0.10.0") .withDocumentation("Enables data-skipping allowing queries to leverage indexes to reduce the search space by " + "skipping over files") + val EXTRACT_PARTITION_VALUES_FROM_PARTITION_PATH: ConfigProperty[Boolean] = + ConfigProperty.key("hoodie.datasource.read.extract.partition.values.from.path") + .defaultValue(false) + .sinceVersion("0.11.0") + .withDocumentation("When set to true, values for partition columns (partition values) will be extracted" + + " from physical partition path (default Spark behavior). When set to false partition values will be" + + " read from the data file (in Hudi partition columns are persisted by default)." 
+ + " This config is a fallback allowing to preserve existing behavior, and should not be used otherwise.") + val INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN_FOR_NON_EXISTING_FILES: ConfigProperty[String] = ConfigProperty .key("hoodie.datasource.read.incr.fallback.fulltablescan.enable") .defaultValue("false") @@ -185,6 +197,8 @@ object DataSourceReadOptions { */ object DataSourceWriteOptions { + import DataSourceOptionsHelper._ + val BULK_INSERT_OPERATION_OPT_VAL = WriteOperationType.BULK_INSERT.value val INSERT_OPERATION_OPT_VAL = WriteOperationType.INSERT.value val UPSERT_OPERATION_OPT_VAL = WriteOperationType.UPSERT.value @@ -471,10 +485,7 @@ object DataSourceWriteOptions { .sinceVersion("0.9.0") .withDocumentation("This class is used by kafka client to deserialize the records") - val DROP_PARTITION_COLUMNS: ConfigProperty[Boolean] = ConfigProperty - .key(HoodieTableConfig.DROP_PARTITION_COLUMNS.key()) - .defaultValue(HoodieTableConfig.DROP_PARTITION_COLUMNS.defaultValue().booleanValue()) - .withDocumentation(HoodieTableConfig.DROP_PARTITION_COLUMNS.doc()) + val DROP_PARTITION_COLUMNS: ConfigProperty[Boolean] = HoodieTableConfig.DROP_PARTITION_COLUMNS /** @deprecated Use {@link HIVE_ASSUME_DATE_PARTITION} and its methods instead */ @Deprecated @@ -774,4 +785,23 @@ object DataSourceOptionsHelper { override def apply (input: From): To = function (input) } } + + implicit def convert[T, U](prop: ConfigProperty[T])(implicit converter: T => U): ConfigProperty[U] = { + checkState(prop.hasDefaultValue) + var newProp: ConfigProperty[U] = ConfigProperty.key(prop.key()) + .defaultValue(converter(prop.defaultValue())) + .withDocumentation(prop.doc()) + .withAlternatives(prop.getAlternatives.asScala: _*) + + newProp = toScalaOption(prop.getSinceVersion) match { + case Some(version) => newProp.sinceVersion(version) + case None => newProp + } + newProp = toScalaOption(prop.getDeprecatedVersion) match { + case Some(version) => newProp.deprecatedAfter(version) + case None => newProp + } + + newProp + } } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala index 7550ff13fd5ea..c1229d55009e4 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala @@ -21,11 +21,13 @@ import org.apache.hadoop.fs.Path import org.apache.hudi.DataSourceReadOptions._ import org.apache.hudi.DataSourceWriteOptions.{BOOTSTRAP_OPERATION_OPT_VAL, OPERATION} import org.apache.hudi.common.fs.FSUtils -import org.apache.hudi.common.model.{HoodieFileFormat, HoodieRecord} +import org.apache.hudi.common.model.HoodieRecord import org.apache.hudi.common.model.HoodieTableType.{COPY_ON_WRITE, MERGE_ON_READ} import org.apache.hudi.common.table.timeline.HoodieInstant import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} +import org.apache.hudi.config.HoodieWriteConfig.SCHEMA_EVOLUTION_ENABLE import org.apache.hudi.exception.HoodieException +import org.apache.hudi.internal.schema.InternalSchema import org.apache.log4j.LogManager import org.apache.spark.sql.execution.streaming.{Sink, Source} import org.apache.spark.sql.hudi.streaming.HoodieStreamSource @@ -108,7 +110,7 @@ class DefaultSource extends RelationProvider case (COPY_ON_WRITE, QUERY_TYPE_SNAPSHOT_OPT_VAL, false) | (COPY_ON_WRITE, QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, false) | 
(MERGE_ON_READ, QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, false) => - new BaseFileOnlyRelation(sqlContext, metaClient, parameters, userSchema, globPaths) + resolveBaseFileOnlyRelation(sqlContext, globPaths, userSchema, metaClient, parameters) case (COPY_ON_WRITE, QUERY_TYPE_INCREMENTAL_OPT_VAL, _) => new IncrementalRelation(sqlContext, parameters, userSchema, metaClient) @@ -141,7 +143,7 @@ class DefaultSource extends RelationProvider * * TODO: Revisit to return a concrete relation here when we support CREATE TABLE AS for Hudi with DataSource API. * That is the only case where Spark seems to actually need a relation to be returned here - * [[DataSource.writeAndRead()]] + * [[org.apache.spark.sql.execution.datasources.DataSource.writeAndRead()]] * * @param sqlContext Spark SQL Context * @param mode Mode for saving the DataFrame at the destination @@ -206,4 +208,32 @@ class DefaultSource extends RelationProvider parameters: Map[String, String]): Source = { new HoodieStreamSource(sqlContext, metadataPath, schema, parameters) } + + private def resolveBaseFileOnlyRelation(sqlContext: SQLContext, + globPaths: Seq[Path], + userSchema: Option[StructType], + metaClient: HoodieTableMetaClient, + optParams: Map[String, String]) = { + val baseRelation = new BaseFileOnlyRelation(sqlContext, metaClient, optParams, userSchema, globPaths) + val enableSchemaOnRead: Boolean = !tryFetchInternalSchema(metaClient).isEmptySchema + + // NOTE: We fallback to [[HadoopFsRelation]] in all of the cases except ones requiring usage of + // [[BaseFileOnlyRelation]] to function correctly. This is necessary to maintain performance parity w/ + // vanilla Spark, since some of the Spark optimizations are predicated on the using of [[HadoopFsRelation]]. + // + // You can check out HUDI-3896 for more details + if (enableSchemaOnRead) { + baseRelation + } else { + baseRelation.toHadoopFsRelation + } + } + + private def tryFetchInternalSchema(metaClient: HoodieTableMetaClient) = + try { + new TableSchemaResolver(metaClient).getTableInternalSchemaFromCommitMetadata + .orElse(InternalSchema.getEmptyInternalSchema) + } catch { + case _: Exception => InternalSchema.getEmptyInternalSchema + } } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala index c33c6dce6d0cd..4b7177f4d6326 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala @@ -19,14 +19,13 @@ package org.apache.hudi import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord - import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.hbase.io.hfile.CacheConfig import org.apache.hadoop.mapred.JobConf - -import org.apache.hudi.HoodieBaseRelation.getPartitionPath +import org.apache.hudi.HoodieBaseRelation.{convertToAvroSchema, createHFileReader, generateUnsafeProjection, getPartitionPath} import org.apache.hudi.HoodieConversionUtils.toScalaOption +import org.apache.hudi.avro.HoodieAvroUtils import org.apache.hudi.common.config.{HoodieMetadataConfig, SerializableConfiguration} import org.apache.hudi.common.fs.FSUtils import org.apache.hudi.common.model.{HoodieFileFormat, HoodieRecord} @@ -38,13 +37,12 @@ import org.apache.hudi.common.util.ValidationUtils.checkState import 
org.apache.hudi.internal.schema.InternalSchema import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter import org.apache.hudi.io.storage.HoodieHFileReader - -import org.apache.spark.TaskContext import org.apache.spark.execution.datasources.HoodieInMemoryFileIndex import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Expression, SubqueryExpression} +import org.apache.spark.sql.execution.FileRelation import org.apache.spark.sql.execution.datasources.{FileStatusCache, PartitionedFile, PartitioningUtils} import org.apache.spark.sql.hudi.HoodieSqlCommonUtils import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan} @@ -52,12 +50,11 @@ import org.apache.spark.sql.types.{StringType, StructField, StructType} import org.apache.spark.sql.{Row, SQLContext, SparkSession} import org.apache.spark.unsafe.types.UTF8String -import java.io.Closeable import java.net.URI - +import java.util.Locale import scala.collection.JavaConverters._ -import scala.util.Try import scala.util.control.NonFatal +import scala.util.{Failure, Success, Try} trait HoodieFileSplit {} @@ -78,7 +75,10 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext, val metaClient: HoodieTableMetaClient, val optParams: Map[String, String], userSchema: Option[StructType]) - extends BaseRelation with PrunedFilteredScan with Logging with SparkAdapterSupport { + extends BaseRelation + with FileRelation + with PrunedFilteredScan + with Logging { type FileSplit <: HoodieFileSplit @@ -124,14 +124,17 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext, protected lazy val (tableAvroSchema: Schema, internalSchema: InternalSchema) = { val schemaUtil = new TableSchemaResolver(metaClient) - val avroSchema = Try(schemaUtil.getTableAvroSchema).getOrElse( - // If there is no commit in the table, we can't get the schema - // t/h [[TableSchemaResolver]], fallback to the provided [[userSchema]] instead. - userSchema match { - case Some(s) => sparkAdapter.getAvroSchemaConverters.toAvroType(s, nullable = false, "record") - case _ => throw new IllegalArgumentException("User-provided schema is required in case the table is empty") - } - ) + val avroSchema = Try(schemaUtil.getTableAvroSchema) match { + case Success(schema) => schema + case Failure(e) => + logWarning("Failed to fetch schema from the table", e) + // If there is no commit in the table, we can't get the schema + // t/h [[TableSchemaResolver]], fallback to the provided [[userSchema]] instead. + userSchema match { + case Some(s) => convertToAvroSchema(s) + case _ => throw new IllegalArgumentException("User-provided schema is required in case the table is empty") + } + } // try to find internalSchema val internalSchemaFromMeta = try { schemaUtil.getTableInternalSchemaFromCommitMetadata.orElse(InternalSchema.getEmptyInternalSchema) @@ -146,10 +149,35 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext, protected val partitionColumns: Array[String] = tableConfig.getPartitionFields.orElse(Array.empty) /** - * if true, need to deal with schema for creating file reader. + * Controls whether partition values (ie values of partition columns) should be + *

+   *   1. Extracted from partition path and appended to individual rows read from the data file
+   *      (we delegate this to Spark's [[ParquetFileFormat]])
+   *   2. Read from the data-file as is (by default Hudi persists all columns including partition ones)
+ * + * This flag is only be relevant in conjunction with the usage of [["hoodie.datasource.write.drop.partition.columns"]] + * config, when Hudi will NOT be persisting partition columns in the data file, and therefore values for + * such partition columns (ie "partition values") will have to be parsed from the partition path, and appended + * to every row only in the fetched dataset. + * + * NOTE: Partition values extracted from partition path might be deviating from the values of the original + * partition columns: for ex, if originally as partition column was used column [[ts]] bearing epoch + * timestamp, which was used by [[TimestampBasedKeyGenerator]] to generate partition path of the format + * [["yyyy/mm/dd"]], appended partition value would bear the format verbatim as it was used in the + * partition path, meaning that string value of "2022/01/01" will be appended, and not its original + * representation */ - protected val dropPartitionColumnsWhenWrite: Boolean = - metaClient.getTableConfig.isDropPartitionColumns && partitionColumns.nonEmpty + protected val shouldExtractPartitionValuesFromPartitionPath: Boolean = { + // Controls whether partition columns (which are the source for the partition path values) should + // be omitted from persistence in the data files. On the read path it affects whether partition values (values + // of partition columns) will be read from the data file ot extracted from partition path + val shouldOmitPartitionColumns = metaClient.getTableConfig.shouldDropPartitionColumns && partitionColumns.nonEmpty + val shouldExtractPartitionValueFromPath = + optParams.getOrElse(DataSourceReadOptions.EXTRACT_PARTITION_VALUES_FROM_PARTITION_PATH.key, + DataSourceReadOptions.EXTRACT_PARTITION_VALUES_FROM_PARTITION_PATH.defaultValue.toString).toBoolean + shouldOmitPartitionColumns || shouldExtractPartitionValueFromPath + } /** * NOTE: PLEASE READ THIS CAREFULLY @@ -170,7 +198,10 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext, * * @VisibleInTests */ - val mandatoryColumns: Seq[String] + val mandatoryFields: Seq[String] + + protected def mandatoryRootFields: Seq[String] = + mandatoryFields.map(col => HoodieAvroUtils.getRootLevelFieldName(col)) protected def timeline: HoodieTimeline = // NOTE: We're including compaction here since it's not considering a "commit" operation @@ -198,19 +229,26 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext, */ override final def needConversion: Boolean = false + override def inputFiles: Array[String] = fileIndex.allFiles.map(_.getPath.toUri.toString).toArray + /** * NOTE: DO NOT OVERRIDE THIS METHOD */ override final def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { - // NOTE: In case list of requested columns doesn't contain the Primary Key one, we + // NOTE: PLEAS READ CAREFULLY BEFORE MAKING CHANGES + // + // In case list of requested columns doesn't contain the Primary Key one, we // have to add it explicitly so that // - Merging could be performed correctly // - In case 0 columns are to be fetched (for ex, when doing {@code count()} on Spark's [[Dataset]], - // Spark still fetches all the rows to execute the query correctly + // Spark still fetches all the rows to execute the query correctly // - // It's okay to return columns that have not been requested by the caller, as those nevertheless will be - // filtered out upstream - val fetchedColumns: Array[String] = appendMandatoryColumns(requiredColumns) + // *Appending* additional columns to the ones requested by the 
caller is not a problem, as those + // will be "projected out" by the caller's projection; + // + // (!!!) IT'S CRITICAL TO AVOID REORDERING OF THE REQUESTED COLUMNS AS THIS WILL BREAK THE UPSTREAM + // PROJECTION + val fetchedColumns: Array[String] = appendMandatoryRootFields(requiredColumns) val (requiredAvroSchema, requiredStructSchema, requiredInternalSchema) = HoodieSparkUtils.getRequiredSchema(tableAvroSchema, fetchedColumns, internalSchema) @@ -220,39 +258,46 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext, val fileSplits = collectFileSplits(partitionFilters, dataFilters) - val partitionSchema = if (dropPartitionColumnsWhenWrite) { - // when hoodie.datasource.write.drop.partition.columns is true, partition columns can't be persisted in - // data files. - StructType(partitionColumns.map(StructField(_, StringType))) - } else { - StructType(Nil) - } + val tableAvroSchemaStr = + if (internalSchema.isEmptySchema) tableAvroSchema.toString + else AvroInternalSchemaConverter.convert(internalSchema, tableAvroSchema.getName).toString - val tableSchema = HoodieTableSchema(tableStructSchema, if (internalSchema.isEmptySchema) tableAvroSchema.toString else AvroInternalSchemaConverter.convert(internalSchema, tableAvroSchema.getName).toString, internalSchema) - val dataSchema = if (dropPartitionColumnsWhenWrite) { - val dataStructType = StructType(tableStructSchema.filterNot(f => partitionColumns.contains(f.name))) - HoodieTableSchema( - dataStructType, - sparkAdapter.getAvroSchemaConverters.toAvroType(dataStructType, nullable = false, "record").toString() - ) - } else { - tableSchema - } - val requiredSchema = if (dropPartitionColumnsWhenWrite) { - val requiredStructType = StructType(requiredStructSchema.filterNot(f => partitionColumns.contains(f.name))) - HoodieTableSchema( - requiredStructType, - sparkAdapter.getAvroSchemaConverters.toAvroType(requiredStructType, nullable = false, "record").toString() - ) + val tableSchema = HoodieTableSchema(tableStructSchema, tableAvroSchemaStr, internalSchema) + val requiredSchema = HoodieTableSchema(requiredStructSchema, requiredAvroSchema.toString, requiredInternalSchema) + + // Since schema requested by the caller might contain partition columns, we might need to + // prune it, removing all partition columns from it in case these columns are not persisted + // in the data files + // + // NOTE: This partition schema is only relevant to file reader to be able to embed + // values of partition columns (hereafter referred to as partition values) encoded into + // the partition path, and omitted from the data file, back into fetched rows; + // Note that, by default, partition columns are not omitted therefore specifying + // partition schema for reader is not required + val (partitionSchema, dataSchema, prunedRequiredSchema) = + tryPrunePartitionColumns(tableSchema, requiredSchema) + + if (fileSplits.isEmpty) { + sparkSession.sparkContext.emptyRDD } else { - HoodieTableSchema(requiredStructSchema, requiredAvroSchema.toString, requiredInternalSchema) + val rdd = composeRDD(fileSplits, partitionSchema, dataSchema, prunedRequiredSchema, filters) + + // NOTE: In case when partition columns have been pruned from the required schema, we have to project + // the rows from the pruned schema back into the one expected by the caller + val projectedRDD = if (prunedRequiredSchema.structTypeSchema != requiredSchema.structTypeSchema) { + rdd.mapPartitions { it => + val fullPrunedSchema = StructType(prunedRequiredSchema.structTypeSchema.fields ++ 
partitionSchema.fields) + val unsafeProjection = generateUnsafeProjection(fullPrunedSchema, requiredSchema.structTypeSchema) + it.map(unsafeProjection) + } + } else { + rdd + } + + // Here we rely on a type erasure, to workaround inherited API restriction and pass [[RDD[InternalRow]]] back as [[RDD[Row]]] + // Please check [[needConversion]] scala-doc for more details + projectedRDD.asInstanceOf[RDD[Row]] } - // Here we rely on a type erasure, to workaround inherited API restriction and pass [[RDD[InternalRow]]] back as [[RDD[Row]]] - // Please check [[needConversion]] scala-doc for more details - if (fileSplits.nonEmpty) - composeRDD(fileSplits, partitionSchema, dataSchema, requiredSchema, filters).asInstanceOf[RDD[Row]] - else - sparkSession.sparkContext.emptyRDD } /** @@ -260,14 +305,14 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext, * * @param fileSplits file splits to be handled by the RDD * @param partitionSchema target table's partition schema - * @param tableSchema target table's schema + * @param dataSchema target table's data files' schema * @param requiredSchema projected schema required by the reader * @param filters data filters to be applied * @return instance of RDD (implementing [[HoodieUnsafeRDD]]) */ protected def composeRDD(fileSplits: Seq[FileSplit], partitionSchema: StructType, - tableSchema: HoodieTableSchema, + dataSchema: HoodieTableSchema, requiredSchema: HoodieTableSchema, filters: Array[Filter]): HoodieUnsafeRDD @@ -319,17 +364,12 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext, !SubqueryExpression.hasSubquery(condition) } - protected final def appendMandatoryColumns(requestedColumns: Array[String]): Array[String] = { - if (dropPartitionColumnsWhenWrite) { - if (requestedColumns.isEmpty) { - mandatoryColumns.toArray - } else { - requestedColumns - } - } else { - val missing = mandatoryColumns.filter(col => !requestedColumns.contains(col)) - requestedColumns ++ missing - } + protected final def appendMandatoryRootFields(requestedColumns: Array[String]): Array[String] = { + // For a nested field in mandatory columns, we should first get the root-level field, and then + // check for any missing column, as the requestedColumns should only contain root-level fields + // We should only append root-level field as well + val missing = mandatoryRootFields.filter(rootField => !requestedColumns.contains(rootField)) + requestedColumns ++ missing } protected def getTableState: HoodieTableState = { @@ -359,7 +399,7 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext, protected def getPartitionColumnsAsInternalRow(file: FileStatus): InternalRow = { try { val tableConfig = metaClient.getTableConfig - if (dropPartitionColumnsWhenWrite) { + if (shouldExtractPartitionValuesFromPartitionPath) { val relativePath = new URI(metaClient.getBasePath).relativize(new URI(file.getPath.getParent.toString)).toString val hiveStylePartitioningEnabled = tableConfig.getHiveStylePartitioningEnable.toBoolean if (hiveStylePartitioningEnabled) { @@ -383,52 +423,52 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext, InternalRow.empty } } -} -object HoodieBaseRelation { - - def getPartitionPath(fileStatus: FileStatus): Path = - fileStatus.getPath.getParent + protected def getColName(f: StructField): String = { + if (sparkSession.sessionState.conf.caseSensitiveAnalysis) { + f.name + } else { + f.name.toLowerCase(Locale.ROOT) + } + } /** * Returns file-reader routine accepting [[PartitionedFile]] and returning an [[Iterator]] * over [[InternalRow]] 
*/ - def createBaseFileReader(spark: SparkSession, - partitionSchema: StructType, - tableSchema: HoodieTableSchema, - requiredSchema: HoodieTableSchema, - filters: Seq[Filter], - options: Map[String, String], - hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { + protected def createBaseFileReader(spark: SparkSession, + partitionSchema: StructType, + dataSchema: HoodieTableSchema, + requiredSchema: HoodieTableSchema, + filters: Seq[Filter], + options: Map[String, String], + hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { val hfileReader = createHFileReader( spark = spark, - tableSchema = tableSchema, + dataSchema = dataSchema, requiredSchema = requiredSchema, filters = filters, options = options, hadoopConf = hadoopConf ) + val parquetReader = HoodieDataSourceHelper.buildHoodieParquetReader( sparkSession = spark, - dataSchema = tableSchema.structTypeSchema, + dataSchema = dataSchema.structTypeSchema, partitionSchema = partitionSchema, requiredSchema = requiredSchema.structTypeSchema, filters = filters, options = options, - hadoopConf = hadoopConf + hadoopConf = hadoopConf, + // We're delegating to Spark to append partition values to every row only in cases + // when these corresponding partition-values are not persisted w/in the data file itself + appendPartitionValues = shouldExtractPartitionValuesFromPartitionPath ) partitionedFile => { val extension = FSUtils.getFileExtension(partitionedFile.filePath) if (HoodieFileFormat.PARQUET.getFileExtension.equals(extension)) { - val iter = parquetReader.apply(partitionedFile) - if (iter.isInstanceOf[Closeable]) { - // register a callback to close parquetReader which will be executed on task completion. - // when tasks finished, this method will be called, and release resources. 
- Option(TaskContext.get()).foreach(_.addTaskCompletionListener[Unit](_ => iter.asInstanceOf[Closeable].close())) - } - iter + parquetReader.apply(partitionedFile) } else if (HoodieFileFormat.HFILE.getFileExtension.equals(extension)) { hfileReader.apply(partitionedFile) } else { @@ -437,8 +477,38 @@ object HoodieBaseRelation { } } + private def tryPrunePartitionColumns(tableSchema: HoodieTableSchema, + requiredSchema: HoodieTableSchema): (StructType, HoodieTableSchema, HoodieTableSchema) = { + if (shouldExtractPartitionValuesFromPartitionPath) { + val partitionSchema = StructType(partitionColumns.map(StructField(_, StringType))) + val prunedDataStructSchema = prunePartitionColumns(tableSchema.structTypeSchema) + val prunedRequiredSchema = prunePartitionColumns(requiredSchema.structTypeSchema) + + (partitionSchema, + HoodieTableSchema(prunedDataStructSchema, convertToAvroSchema(prunedDataStructSchema).toString), + HoodieTableSchema(prunedRequiredSchema, convertToAvroSchema(prunedRequiredSchema).toString)) + } else { + (StructType(Nil), tableSchema, requiredSchema) + } + } + + private def prunePartitionColumns(dataStructSchema: StructType): StructType = + StructType(dataStructSchema.filterNot(f => partitionColumns.contains(f.name))) +} + +object HoodieBaseRelation extends SparkAdapterSupport { + + private def generateUnsafeProjection(from: StructType, to: StructType) = + sparkAdapter.createCatalystExpressionUtils().generateUnsafeProjection(from, to) + + def convertToAvroSchema(structSchema: StructType): Schema = + sparkAdapter.getAvroSchemaConverters.toAvroType(structSchema, nullable = false, "Record") + + def getPartitionPath(fileStatus: FileStatus): Path = + fileStatus.getPath.getParent + private def createHFileReader(spark: SparkSession, - tableSchema: HoodieTableSchema, + dataSchema: HoodieTableSchema, requiredSchema: HoodieTableSchema, filters: Seq[Filter], options: Map[String, String], diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieDataSourceHelper.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieDataSourceHelper.scala index 02264bc4a62fb..e430364be9423 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieDataSourceHelper.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieDataSourceHelper.scala @@ -21,6 +21,7 @@ package org.apache.hudi import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileStatus import org.apache.hudi.client.utils.SparkInternalSchemaConverter +import org.apache.hudi.common.util.StringUtils.isNullOrEmpty import org.apache.hudi.internal.schema.InternalSchema import org.apache.hudi.internal.schema.utils.SerDeHelper import org.apache.spark.sql.SparkSession @@ -38,8 +39,8 @@ object HoodieDataSourceHelper extends PredicateHelper with SparkAdapterSupport { /** - * Wrapper `buildReaderWithPartitionValues` of [[ParquetFileFormat]] - * to deal with [[ColumnarBatch]] when enable parquet vectorized reader if necessary. 
+ * Wrapper for `buildReaderWithPartitionValues` of [[ParquetFileFormat]] handling [[ColumnarBatch]], + * when Parquet's Vectorized Reader is used */ def buildHoodieParquetReader(sparkSession: SparkSession, dataSchema: StructType, @@ -47,9 +48,10 @@ object HoodieDataSourceHelper extends PredicateHelper with SparkAdapterSupport { requiredSchema: StructType, filters: Seq[Filter], options: Map[String, String], - hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { - - val readParquetFile: PartitionedFile => Iterator[Any] = sparkAdapter.createHoodieParquetFileFormat().get.buildReaderWithPartitionValues( + hadoopConf: Configuration, + appendPartitionValues: Boolean = false): PartitionedFile => Iterator[InternalRow] = { + val parquetFileFormat: ParquetFileFormat = sparkAdapter.createHoodieParquetFileFormat(appendPartitionValues).get + val readParquetFile: PartitionedFile => Iterator[Any] = parquetFileFormat.buildReaderWithPartitionValues( sparkSession = sparkSession, dataSchema = dataSchema, partitionSchema = partitionSchema, @@ -91,9 +93,12 @@ object HoodieDataSourceHelper extends PredicateHelper with SparkAdapterSupport { * @param validCommits valid commits, using give validCommits to validate all legal histroy Schema files, and return the latest one. */ def getConfigurationWithInternalSchema(conf: Configuration, internalSchema: InternalSchema, tablePath: String, validCommits: String): Configuration = { - conf.set(SparkInternalSchemaConverter.HOODIE_QUERY_SCHEMA, SerDeHelper.toJson(internalSchema)) - conf.set(SparkInternalSchemaConverter.HOODIE_TABLE_PATH, tablePath) - conf.set(SparkInternalSchemaConverter.HOODIE_VALID_COMMITS_LIST, validCommits) + val querySchemaString = SerDeHelper.toJson(internalSchema) + if (!isNullOrEmpty(querySchemaString)) { + conf.set(SparkInternalSchemaConverter.HOODIE_QUERY_SCHEMA, SerDeHelper.toJson(internalSchema)) + conf.set(SparkInternalSchemaConverter.HOODIE_TABLE_PATH, tablePath) + conf.set(SparkInternalSchemaConverter.HOODIE_VALID_COMMITS_LIST, validCommits) + } conf } } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala index 0ea4d1cef2e04..08d0d722b2f68 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala @@ -85,11 +85,6 @@ case class HoodieFileIndex(spark: SparkSession, override def rootPaths: Seq[Path] = queryPaths.asScala - def isDataSkippingEnabled: Boolean = { - options.getOrElse(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key(), - spark.sessionState.conf.getConfString(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key(), "false")).toBoolean - } - /** * Returns the FileStatus for all the base files (excluding log files). 
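The HoodieFileIndex changes that follow make data skipping contingent on the metadata table and its column-stats index being available on the read path; when they are not, the new validateConfig() only logs a warning. A hedged sketch of a read that satisfies all three prerequisites; the option keys are spelled out from memory as assumptions for the constants used in the code (DataSourceReadOptions.ENABLE_DATA_SKIPPING, HoodieMetadataConfig.ENABLE, HoodieMetadataConfig.ENABLE_METADATA_INDEX_COLUMN_STATS):

```scala
import org.apache.spark.sql.{DataFrame, SparkSession}

// Data skipping only takes effect when all three flags are on for the read.
def readWithDataSkipping(spark: SparkSession, basePath: String): DataFrame =
  spark.read.format("hudi")
    .option("hoodie.enable.data.skipping", "true")             // assumed key behind ENABLE_DATA_SKIPPING
    .option("hoodie.metadata.enable", "true")                  // metadata table on the read path
    .option("hoodie.metadata.index.column.stats.enable", "true") // column-stats index must have been built
    .load(basePath)
```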
This should be used only for * cases where Spark directly fetches the list of files via HoodieFileIndex or for read optimized query logic @@ -196,12 +191,20 @@ case class HoodieFileIndex(spark: SparkSession, * @return list of pruned (data-skipped) candidate base-files' names */ private def lookupCandidateFilesInMetadataTable(queryFilters: Seq[Expression]): Try[Option[Set[String]]] = Try { - if (!isDataSkippingEnabled || queryFilters.isEmpty || !HoodieTableMetadataUtil.getCompletedMetadataPartitions(metaClient.getTableConfig) .contains(HoodieTableMetadataUtil.PARTITION_NAME_COLUMN_STATS)) { + // NOTE: Data Skipping is only effective when it references columns that are indexed w/in + // the Column Stats Index (CSI). The following cases cannot be effectively handled by Data Skipping: + // - Expressions on a top-level column's nested fields (e.g., filters like "struct.field > 0"), since + // CSI only contains stats for top-level columns (in this case, "struct") + // - Any expression not directly referencing a top-level column (e.g., sub-queries), since there's + // nothing in CSI that could be applied to them + lazy val queryReferencedColumns = collectReferencedColumns(spark, queryFilters, schema) + + if (!isMetadataTableEnabled || !isColumnStatsIndexAvailable || !isDataSkippingEnabled) { + validateConfig() + Option.empty + } else if (queryFilters.isEmpty || queryReferencedColumns.isEmpty) { Option.empty } else { - val queryReferencedColumns = collectReferencedColumns(spark, queryFilters, schema) - val colStatsDF: DataFrame = readColumnStatsIndex(spark, basePath, metadataConfig, queryReferencedColumns) // Persist DF to avoid re-computing column statistics unraveling @@ -245,13 +248,27 @@ case class HoodieFileIndex(spark: SparkSession, override def refresh(): Unit = super.refresh() - override def inputFiles: Array[String] = { - val fileStatusList = allFiles - fileStatusList.map(_.getPath.toString).toArray - } + override def inputFiles: Array[String] = + allFiles.map(_.getPath.toString).toArray - override def sizeInBytes: Long = { - cachedFileSize + override def sizeInBytes: Long = cachedFileSize + + private def isColumnStatsIndexAvailable = + HoodieTableMetadataUtil.getCompletedMetadataPartitions(metaClient.getTableConfig) + .contains(HoodieTableMetadataUtil.PARTITION_NAME_COLUMN_STATS) + + private def isDataSkippingEnabled: Boolean = + options.getOrElse(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key(), + spark.sessionState.conf.getConfString(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key(), "false")).toBoolean + + private def isMetadataTableEnabled: Boolean = metadataConfig.enabled() + private def isColumnStatsIndexEnabled: Boolean = metadataConfig.isColumnStatsIndexEnabled + + private def validateConfig(): Unit = { + if (isDataSkippingEnabled && (!isMetadataTableEnabled || !isColumnStatsIndexEnabled)) { + logWarning("Data skipping requires both the Metadata Table and the Column Stats Index to be enabled! " +
+ s"(isMetadataTableEnabled = ${isMetadataTableEnabled}, isColumnStatsIndexEnabled = ${isColumnStatsIndexEnabled})") + } } } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieMergeOnReadRDD.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieMergeOnReadRDD.scala index c0c47cff427c3..a7ca60865fbba 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieMergeOnReadRDD.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieMergeOnReadRDD.scala @@ -345,7 +345,7 @@ private object HoodieMergeOnReadRDD { val logRecordScannerBuilder = HoodieMergedLogRecordScanner.newBuilder() .withFileSystem(fs) .withBasePath(tablePath) - .withLogFilePaths(logFiles.map(logFile => getFilePath(logFile.getPath)).asJava) + .withLogFilePaths(logFiles.map(logFile => logFile.getPath.toString).asJava) .withReaderSchema(logSchema) .withLatestInstantTime(tableState.latestCommitTimestamp) .withReadBlocksLazily( diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala index c86b1615ba58d..4423874ab8e8c 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala @@ -21,7 +21,6 @@ import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.hadoop.hive.conf.HiveConf import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.HoodieConversionUtils.toProperties import org.apache.hudi.HoodieWriterUtils._ @@ -88,7 +87,7 @@ object HoodieSparkSqlWriter { val (parameters, hoodieConfig) = mergeParamsAndGetHoodieConfig(optParams, tableConfig) val originKeyGeneratorClassName = HoodieWriterUtils.getOriginKeyGenerator(parameters) - val timestampKeyGeneratorConfigs = extractConfigsRelatedToTimestmapBasedKeyGenerator( + val timestampKeyGeneratorConfigs = extractConfigsRelatedToTimestampBasedKeyGenerator( originKeyGeneratorClassName, parameters) //validate datasource and tableconfig keygen are the same validateKeyGeneratorConfig(originKeyGeneratorClassName, tableConfig); @@ -151,7 +150,9 @@ object HoodieSparkSqlWriter { .setBaseFileFormat(baseFileFormat) .setArchiveLogFolder(archiveLogFolder) .setPayloadClassName(hoodieConfig.getString(PAYLOAD_CLASS_NAME)) - .setPreCombineField(hoodieConfig.getStringOrDefault(PRECOMBINE_FIELD, null)) + // We can't fetch the preCombine field from the hoodieConfig object, since it falls back to "ts" as the default value; + // we are interested in what the user has actually set, hence we fetch it from optParams.
+ .setPreCombineField(optParams.getOrElse(PRECOMBINE_FIELD.key(), null)) .setPartitionFields(partitionColumns) .setPopulateMetaFields(populateMetaFields) .setRecordKeyFields(hoodieConfig.getString(RECORDKEY_FIELD)) @@ -160,7 +161,7 @@ object HoodieSparkSqlWriter { .setHiveStylePartitioningEnable(hoodieConfig.getBoolean(HIVE_STYLE_PARTITIONING)) .setUrlEncodePartitioning(hoodieConfig.getBoolean(URL_ENCODE_PARTITIONING)) .setPartitionMetafileUseBaseFormat(useBaseFormatMetaFile) - .setDropPartitionColumnsWhenWrite(hoodieConfig.getBooleanOrDefault(HoodieTableConfig.DROP_PARTITION_COLUMNS)) + .setShouldDropPartitionColumns(hoodieConfig.getBooleanOrDefault(HoodieTableConfig.DROP_PARTITION_COLUMNS)) .setCommitTimezone(HoodieTimelineTimeZone.valueOf(hoodieConfig.getStringOrDefault(HoodieTableConfig.TIMELINE_TIMEZONE))) .initTable(sparkContext.hadoopConfiguration, path) tableConfig = tableMetaClient.getTableConfig @@ -619,11 +620,8 @@ object HoodieSparkSqlWriter { properties.put(HoodieSyncConfig.META_SYNC_SPARK_VERSION.key, SPARK_VERSION) properties.put(HoodieSyncConfig.META_SYNC_USE_FILE_LISTING_FROM_METADATA.key, hoodieConfig.getBoolean(HoodieMetadataConfig.ENABLE)) - val hiveConf: HiveConf = new HiveConf() - hiveConf.addResource(fs.getConf) - syncClientToolClassSet.foreach(impl => { - SyncUtilHelpers.runHoodieMetaSync(impl.trim, properties, hiveConf, fs, basePath.toString, baseFileFormat) + SyncUtilHelpers.runHoodieMetaSync(impl.trim, properties, fs.getConf, fs, basePath.toString, baseFileFormat) }) } true @@ -758,7 +756,7 @@ object HoodieSparkSqlWriter { (params, HoodieWriterUtils.convertMapToHoodieConfig(params)) } - private def extractConfigsRelatedToTimestmapBasedKeyGenerator(keyGenerator: String, + private def extractConfigsRelatedToTimestampBasedKeyGenerator(keyGenerator: String, params: Map[String, String]): Map[String, String] = { if (keyGenerator.equals(classOf[TimestampBasedKeyGenerator].getCanonicalName) || keyGenerator.equals(classOf[TimestampBasedAvroKeyGenerator].getCanonicalName)) { diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/IncrementalRelation.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/IncrementalRelation.scala index 039dafb596d8d..d9d5812adbe2f 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/IncrementalRelation.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/IncrementalRelation.scala @@ -20,8 +20,8 @@ package org.apache.hudi import org.apache.avro.Schema import org.apache.hudi.common.model.{HoodieCommitMetadata, HoodieFileFormat, HoodieRecord, HoodieReplaceCommitMetadata} import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} -import java.util.stream.Collectors +import java.util.stream.Collectors import org.apache.hadoop.fs.{GlobPattern, Path} import org.apache.hudi.client.common.HoodieSparkEngineContext import org.apache.hudi.client.utils.SparkInternalSchemaConverter @@ -36,6 +36,7 @@ import org.apache.hudi.table.HoodieSparkTable import org.apache.log4j.LogManager import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.rdd.RDD +import org.apache.spark.sql.execution.datasources.parquet.HoodieParquetFileFormat import org.apache.spark.sql.sources.{BaseRelation, TableScan} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, Row, SQLContext} @@ -183,7 +184,7 @@ class IncrementalRelation(val sqlContext: SQLContext, 
sqlContext.sparkContext.hadoopConfiguration.set(SparkInternalSchemaConverter.HOODIE_TABLE_PATH, metaClient.getBasePath) sqlContext.sparkContext.hadoopConfiguration.set(SparkInternalSchemaConverter.HOODIE_VALID_COMMITS_LIST, validCommits) val formatClassName = metaClient.getTableConfig.getBaseFileFormat match { - case HoodieFileFormat.PARQUET => if (!internalSchema.isEmptySchema) "HoodieParquet" else "parquet" + case HoodieFileFormat.PARQUET => HoodieParquetFileFormat.FILE_FORMAT_ID case HoodieFileFormat.ORC => "orc" } sqlContext.sparkContext.hadoopConfiguration.unset("mapreduce.input.pathFilter.class") diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadIncrementalRelation.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadIncrementalRelation.scala index 46e395fc2bfe1..806a5e371df55 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadIncrementalRelation.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadIncrementalRelation.scala @@ -19,9 +19,7 @@ package org.apache.hudi import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{GlobPattern, Path} -import org.apache.hudi.HoodieBaseRelation.createBaseFileReader import org.apache.hudi.HoodieConversionUtils.toScalaOption -import org.apache.hudi.common.fs.FSUtils.getRelativePartitionPath import org.apache.hudi.common.model.{FileSlice, HoodieRecord} import org.apache.hudi.common.table.HoodieTableMetaClient import org.apache.hudi.common.table.timeline.{HoodieInstant, HoodieTimeline} @@ -61,14 +59,14 @@ class MergeOnReadIncrementalRelation(sqlContext: SQLContext, protected override def composeRDD(fileSplits: Seq[HoodieMergeOnReadFileSplit], partitionSchema: StructType, - tableSchema: HoodieTableSchema, + dataSchema: HoodieTableSchema, requiredSchema: HoodieTableSchema, filters: Array[Filter]): HoodieMergeOnReadRDD = { val fullSchemaParquetReader = createBaseFileReader( spark = sqlContext.sparkSession, partitionSchema = partitionSchema, - tableSchema = tableSchema, - requiredSchema = tableSchema, + dataSchema = dataSchema, + requiredSchema = dataSchema, // This file-reader is used to read base file records, subsequently merging them with the records // stored in delta-log files. 
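The point made in this comment (and repeated in MergeOnReadSnapshotRelation below) can be illustrated with a toy merge, independent of any Hudi types: if the base-file reader applied the query filters before the delta-log records were merged in, rows whose merged value would pass the filter could be lost.

```scala
// Toy stand-in for payload merging: the delta log carries increments to apply to base rows.
val baseRecords = Map("k1" -> 10, "k2" -> 15)   // base file contents
val logDeltas   = Map("k2" -> 10)               // delta-log contents

val merged   = baseRecords.map { case (k, v) => k -> (v + logDeltas.getOrElse(k, 0)) }
val filtered = merged.filter { case (_, v) => v > 20 } // Map("k2" -> 25)

// Filtering the base file first with v > 20 would have dropped k2 (value 15) before the
// log delta was applied, silently losing a row that belongs in the result.
```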
As such, we have to read _all_ records from the base file, while avoiding // applying any user-defined filtering _before_ we complete combining them w/ delta-log records (to make sure that @@ -86,7 +84,7 @@ class MergeOnReadIncrementalRelation(sqlContext: SQLContext, val requiredSchemaParquetReader = createBaseFileReader( spark = sqlContext.sparkSession, partitionSchema = partitionSchema, - tableSchema = tableSchema, + dataSchema = dataSchema, requiredSchema = requiredSchema, filters = filters ++ incrementalSpanRecordFilters, options = optParams, @@ -99,7 +97,7 @@ class MergeOnReadIncrementalRelation(sqlContext: SQLContext, // TODO(HUDI-3639) implement incremental span record filtering w/in RDD to make sure returned iterator is appropriately // filtered, since file-reader might not be capable to perform filtering new HoodieMergeOnReadRDD(sqlContext.sparkContext, jobConf, fullSchemaParquetReader, requiredSchemaParquetReader, - tableSchema, requiredSchema, hoodieTableState, mergeType, fileSplits) + dataSchema, requiredSchema, hoodieTableState, mergeType, fileSplits) } override protected def collectFileSplits(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): List[HoodieMergeOnReadFileSplit] = { @@ -155,7 +153,7 @@ trait HoodieIncrementalRelationTrait extends HoodieBaseRelation { Seq(isNotNullFilter, largerThanFilter, lessThanFilter) } - override lazy val mandatoryColumns: Seq[String] = { + override lazy val mandatoryFields: Seq[String] = { // NOTE: This columns are required for Incremental flow to be able to handle the rows properly, even in // cases when no columns are requested to be fetched (for ex, when using {@code count()} API) Seq(HoodieRecord.RECORD_KEY_METADATA_FIELD, HoodieRecord.COMMIT_TIME_METADATA_FIELD) ++ diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadSnapshotRelation.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadSnapshotRelation.scala index d85788e25b303..75bc96624e7b0 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadSnapshotRelation.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadSnapshotRelation.scala @@ -20,17 +20,14 @@ package org.apache.hudi import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path -import org.apache.hudi.HoodieBaseRelation.createBaseFileReader import org.apache.hudi.HoodieConversionUtils.toScalaOption import org.apache.hudi.MergeOnReadSnapshotRelation.getFilePath import org.apache.hudi.common.fs.FSUtils.getRelativePartitionPath import org.apache.hudi.common.model.{FileSlice, HoodieLogFile} import org.apache.hudi.common.table.HoodieTableMetaClient import org.apache.hudi.common.table.view.HoodieTableFileSystemView -import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils.getMaxCompactionMemoryInBytes import org.apache.spark.execution.datasources.HoodieInMemoryFileIndex import org.apache.spark.sql.SQLContext -import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.execution.datasources.PartitionedFile import org.apache.spark.sql.sources.Filter @@ -50,7 +47,7 @@ class MergeOnReadSnapshotRelation(sqlContext: SQLContext, override type FileSplit = HoodieMergeOnReadFileSplit - override lazy val mandatoryColumns: Seq[String] = + override lazy val mandatoryFields: Seq[String] = Seq(recordKeyField) ++ preCombineFieldOpt.map(Seq(_)).getOrElse(Seq()) protected 
val mergeType: String = optParams.getOrElse(DataSourceReadOptions.REALTIME_MERGE.key, @@ -63,14 +60,14 @@ class MergeOnReadSnapshotRelation(sqlContext: SQLContext, protected override def composeRDD(fileSplits: Seq[HoodieMergeOnReadFileSplit], partitionSchema: StructType, - tableSchema: HoodieTableSchema, + dataSchema: HoodieTableSchema, requiredSchema: HoodieTableSchema, filters: Array[Filter]): HoodieMergeOnReadRDD = { val fullSchemaParquetReader = createBaseFileReader( spark = sqlContext.sparkSession, partitionSchema = partitionSchema, - tableSchema = tableSchema, - requiredSchema = tableSchema, + dataSchema = dataSchema, + requiredSchema = dataSchema, // This file-reader is used to read base file records, subsequently merging them with the records // stored in delta-log files. As such, we have to read _all_ records from the base file, while avoiding // applying any filtering _before_ we complete combining them w/ delta-log records (to make sure that @@ -85,7 +82,7 @@ class MergeOnReadSnapshotRelation(sqlContext: SQLContext, val requiredSchemaParquetReader = createBaseFileReader( spark = sqlContext.sparkSession, partitionSchema = partitionSchema, - tableSchema = tableSchema, + dataSchema = dataSchema, requiredSchema = requiredSchema, filters = filters, options = optParams, @@ -96,7 +93,7 @@ class MergeOnReadSnapshotRelation(sqlContext: SQLContext, val tableState = getTableState new HoodieMergeOnReadRDD(sqlContext.sparkContext, jobConf, fullSchemaParquetReader, requiredSchemaParquetReader, - tableSchema, requiredSchema, tableState, mergeType, fileSplits) + dataSchema, requiredSchema, tableState, mergeType, fileSplits) } protected override def collectFileSplits(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): List[HoodieMergeOnReadFileSplit] = { diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkHoodieTableFileIndex.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkHoodieTableFileIndex.scala index 1305323bd1a28..cd1c1fb4affc4 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkHoodieTableFileIndex.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkHoodieTableFileIndex.scala @@ -120,6 +120,9 @@ class SparkHoodieTableFileIndex(spark: SparkSession, StructType(schema.fields.filterNot(f => partitionColumns.contains(f.name))) } + /** + * @VisibleForTesting + */ def partitionSchema: StructType = { if (queryAsNonePartitionedTable) { // If we read it as Non-Partitioned table, we should not diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/SparkHoodieParquetFileFormat.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieParquetFileFormat.scala similarity index 50% rename from hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/SparkHoodieParquetFileFormat.scala rename to hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieParquetFileFormat.scala index 150178ea69066..a52e9335fe374 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/SparkHoodieParquetFileFormat.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieParquetFileFormat.scala @@ -19,30 +19,38 @@ 
package org.apache.spark.sql.execution.datasources.parquet import org.apache.hadoop.conf.Configuration -import org.apache.hudi.SparkAdapterSupport +import org.apache.hudi.{DataSourceReadOptions, SparkAdapterSupport} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.PartitionedFile +import org.apache.spark.sql.execution.datasources.parquet.HoodieParquetFileFormat.FILE_FORMAT_ID import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.StructType -class SparkHoodieParquetFileFormat extends ParquetFileFormat with SparkAdapterSupport { - override def shortName(): String = "HoodieParquet" +class HoodieParquetFileFormat extends ParquetFileFormat with SparkAdapterSupport { - override def toString: String = "HoodieParquet" + override def shortName(): String = FILE_FORMAT_ID + + override def toString: String = "Hoodie-Parquet" + + override def buildReaderWithPartitionValues(sparkSession: SparkSession, + dataSchema: StructType, + partitionSchema: StructType, + requiredSchema: StructType, + filters: Seq[Filter], + options: Map[String, String], + hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { + val shouldExtractPartitionValuesFromPartitionPath = + options.getOrElse(DataSourceReadOptions.EXTRACT_PARTITION_VALUES_FROM_PARTITION_PATH.key, + DataSourceReadOptions.EXTRACT_PARTITION_VALUES_FROM_PARTITION_PATH.defaultValue.toString).toBoolean - override def buildReaderWithPartitionValues( - sparkSession: SparkSession, - dataSchema: StructType, - partitionSchema: StructType, - requiredSchema: StructType, - filters: Seq[Filter], - options: Map[String, String], - hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { sparkAdapter - .createHoodieParquetFileFormat().get + .createHoodieParquetFileFormat(shouldExtractPartitionValuesFromPartitionPath).get .buildReaderWithPartitionValues(sparkSession, dataSchema, partitionSchema, requiredSchema, filters, options, hadoopConf) } } +object HoodieParquetFileFormat { + val FILE_FORMAT_ID = "hoodie-parquet" +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlCommonUtils.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlCommonUtils.scala index fcdbacea51e43..fa01ba37e9d7a 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlCommonUtils.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlCommonUtils.scala @@ -29,13 +29,12 @@ import org.apache.hudi.common.util.PartitionPathEncodeUtils import org.apache.hudi.{AvroConversionUtils, SparkAdapterSupport} import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.analysis.{Resolver, UnresolvedRelation} -import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType, HoodieCatalogTable} +import org.apache.spark.sql.catalyst.analysis.Resolver +import org.apache.spark.sql.catalyst.catalog.{CatalogTable, HoodieCatalogTable} import org.apache.spark.sql.catalyst.expressions.{And, Attribute, Cast, Expression, Literal} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias} -import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} -import org.apache.spark.sql.types.{DataType, NullType, StringType, 
StructField, StructType} +import org.apache.spark.sql.types._ import org.apache.spark.sql.{AnalysisException, Column, DataFrame, SparkSession} import java.net.URI diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableDropPartitionCommand.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableDropPartitionCommand.scala index c7afbfe11f998..628f383b6903c 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableDropPartitionCommand.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableDropPartitionCommand.scala @@ -18,8 +18,6 @@ package org.apache.spark.sql.hudi.command import org.apache.hudi.HoodieSparkSqlWriter -import org.apache.hudi.client.common.HoodieSparkEngineContext -import org.apache.hudi.common.fs.FSUtils import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable @@ -29,11 +27,11 @@ import org.apache.spark.sql.hudi.ProvidesHoodieConfig import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession} case class AlterHoodieTableDropPartitionCommand( - tableIdentifier: TableIdentifier, - specs: Seq[TablePartitionSpec], - ifExists : Boolean, - purge : Boolean, - retainData : Boolean) + tableIdentifier: TableIdentifier, + partitionSpecs: Seq[TablePartitionSpec], + ifExists : Boolean, + purge : Boolean, + retainData : Boolean) extends HoodieLeafRunnableCommand with ProvidesHoodieConfig { override def run(sparkSession: SparkSession): Seq[Row] = { @@ -49,7 +47,7 @@ case class AlterHoodieTableDropPartitionCommand( DDLUtils.verifyAlterTableType( sparkSession.sessionState.catalog, hoodieCatalogTable.table, isView = false) - val normalizedSpecs: Seq[Map[String, String]] = specs.map { spec => + val normalizedSpecs: Seq[Map[String, String]] = partitionSpecs.map { spec => normalizePartitionSpec( spec, hoodieCatalogTable.partitionFields, @@ -57,6 +55,8 @@ case class AlterHoodieTableDropPartitionCommand( sparkSession.sessionState.conf.resolver) } + // Partitions are dropped lazily (https://github.com/apache/hudi/pull/4489): the partition files are + // deleted later by the cleaner, according to the configured retention policies.
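Conceptually, the write issued below is a datasource append carrying a delete-partition operation, with physical file removal left to the cleaner. A hedged sketch of the shape of that call, with an assumed parameter map in place of whatever buildHoodieDropPartitionsConfig actually assembles (option keys are stated from memory and should be double-checked):

```scala
import org.apache.hudi.HoodieSparkSqlWriter
import org.apache.spark.sql.{SQLContext, SaveMode}

// Hypothetical parameter map; the real command builds it via buildHoodieDropPartitionsConfig.
def dropPartitionLazily(sqlContext: SQLContext, basePath: String, tableName: String, partitionPath: String): Unit = {
  val params = Map(
    "path" -> basePath,
    "hoodie.table.name" -> tableName,
    "hoodie.datasource.write.operation" -> "delete_partition",
    "hoodie.datasource.write.partitions.to.delete" -> partitionPath
  )
  // No rows are needed for a partition drop, hence the empty DataFrame, mirroring the command itself.
  HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, params, sqlContext.sparkSession.emptyDataFrame)
}
```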
val partitionsToDrop = getPartitionPathToDrop(hoodieCatalogTable, normalizedSpecs) val parameters = buildHoodieDropPartitionsConfig(sparkSession, hoodieCatalogTable, partitionsToDrop) HoodieSparkSqlWriter.write( @@ -65,17 +65,6 @@ case class AlterHoodieTableDropPartitionCommand( parameters, sparkSession.emptyDataFrame) - - // Recursively delete partition directories - if (purge) { - val engineContext = new HoodieSparkEngineContext(sparkSession.sparkContext) - val basePath = hoodieCatalogTable.tableLocation - val fullPartitionPath = FSUtils.getPartitionPath(basePath, partitionsToDrop) - logInfo("Clean partition up " + fullPartitionPath) - val fs = FSUtils.getFs(basePath, sparkSession.sparkContext.hadoopConfiguration) - FSUtils.deleteDir(engineContext, fs, fullPartitionPath, sparkSession.sparkContext.defaultParallelism) - } - sparkSession.catalog.refreshTable(tableIdentifier.unquotedString) logInfo(s"Finish execute alter table drop partition command for $fullTableName") Seq.empty[Row] diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableRenameCommand.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableRenameCommand.scala index c7b5bdc202f65..094106c8d06d8 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableRenameCommand.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableRenameCommand.scala @@ -27,11 +27,11 @@ import org.apache.spark.sql.execution.command.AlterTableRenameCommand /** * Command for alter hudi table's table name. */ -class AlterHoodieTableRenameCommand( +case class AlterHoodieTableRenameCommand( oldName: TableIdentifier, newName: TableIdentifier, isView: Boolean) - extends AlterTableRenameCommand(oldName, newName, isView) { + extends HoodieLeafRunnableCommand { override def run(sparkSession: SparkSession): Seq[Row] = { if (newName != oldName) { @@ -45,7 +45,7 @@ class AlterHoodieTableRenameCommand( .initTable(hadoopConf, hoodieCatalogTable.tableLocation) // Call AlterTableRenameCommand#run to rename table in meta. 
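The change below replaces inheritance with delegation: the Hudi command becomes a leaf command of its own and only instantiates Spark's AlterTableRenameCommand for the catalog update. A minimal sketch of that delegation (the helper name is illustrative):

```scala
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.execution.command.AlterTableRenameCommand

// After hoodie.properties has been updated with the new table name, delegate the
// catalog-level rename to Spark's own command instead of calling super.run(...).
def renameInCatalog(spark: SparkSession, oldName: TableIdentifier, newName: TableIdentifier): Seq[Row] =
  AlterTableRenameCommand(oldName, newName, isView = false).run(spark)
```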
- super.run(sparkSession) + AlterTableRenameCommand(oldName, newName, isView).run(sparkSession) } Seq.empty[Row] } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/TruncateHoodieTableCommand.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/TruncateHoodieTableCommand.scala index 04936978ed1de..f5349ee5feed4 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/TruncateHoodieTableCommand.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/TruncateHoodieTableCommand.scala @@ -18,15 +18,16 @@ package org.apache.spark.sql.hudi.command import org.apache.hadoop.fs.Path +import org.apache.hudi.HoodieSparkSqlWriter +import org.apache.hudi.client.common.HoodieSparkEngineContext import org.apache.hudi.common.fs.FSUtils import org.apache.hudi.common.table.HoodieTableMetaClient import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec -import org.apache.spark.sql.catalyst.catalog.{CatalogStatistics, CatalogTableType, HoodieCatalogTable} +import org.apache.spark.sql.catalyst.catalog.{CatalogTableType, HoodieCatalogTable} import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.{getPartitionPathToDrop, normalizePartitionSpec} -import org.apache.spark.sql.{AnalysisException, Row, SparkSession} - -import scala.util.control.NonFatal +import org.apache.spark.sql.hudi.ProvidesHoodieConfig +import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession} /** * Command for truncate hudi table. @@ -34,99 +35,67 @@ import scala.util.control.NonFatal case class TruncateHoodieTableCommand( tableIdentifier: TableIdentifier, partitionSpec: Option[TablePartitionSpec]) - extends HoodieLeafRunnableCommand { + extends HoodieLeafRunnableCommand with ProvidesHoodieConfig { - override def run(spark: SparkSession): Seq[Row] = { + override def run(sparkSession: SparkSession): Seq[Row] = { val fullTableName = s"${tableIdentifier.database}.${tableIdentifier.table}" logInfo(s"start execute truncate table command for $fullTableName") - val hoodieCatalogTable = HoodieCatalogTable(spark, tableIdentifier) - val properties = hoodieCatalogTable.tableConfig.getProps - - try { - // Delete all data in the table directory - val catalog = spark.sessionState.catalog - val table = catalog.getTableMetadata(tableIdentifier) - val tableIdentWithDB = table.identifier.quotedString - - if (table.tableType == CatalogTableType.VIEW) { - throw new AnalysisException( - s"Operation not allowed: TRUNCATE TABLE on views: $tableIdentWithDB") - } - - if (table.partitionColumnNames.isEmpty && partitionSpec.isDefined) { - throw new AnalysisException( - s"Operation not allowed: TRUNCATE TABLE ... 
PARTITION is not supported " + - s"for tables that are not partitioned: $tableIdentWithDB") - } - - val basePath = hoodieCatalogTable.tableLocation - val partCols = table.partitionColumnNames - val locations = if (partitionSpec.isEmpty || partCols.isEmpty) { - Seq(basePath) - } else { - val normalizedSpec: Seq[Map[String, String]] = Seq(partitionSpec.map { spec => - normalizePartitionSpec( - spec, - partCols, - table.identifier.quotedString, - spark.sessionState.conf.resolver) - }.get) + val hoodieCatalogTable = HoodieCatalogTable(sparkSession, tableIdentifier) - val fullPartitionPath = FSUtils.getPartitionPath(basePath, getPartitionPathToDrop(hoodieCatalogTable, normalizedSpec)) + val catalog = sparkSession.sessionState.catalog + val table = catalog.getTableMetadata(tableIdentifier) + val tableId = table.identifier.quotedString - Seq(fullPartitionPath) - } - - val hadoopConf = spark.sessionState.newHadoopConf() - locations.foreach { location => - val path = new Path(location.toString) - try { - val fs = path.getFileSystem(hadoopConf) - fs.delete(path, true) - fs.mkdirs(path) - } catch { - case NonFatal(e) => - throw new AnalysisException( - s"Failed to truncate table $tableIdentWithDB when removing data of the path: $path " + - s"because of ${e.toString}") - } - } - - // Also try to drop the contents of the table from the columnar cache - try { - spark.sharedState.cacheManager.uncacheQuery(spark.table(table.identifier), cascade = true) - } catch { - case NonFatal(_) => - } + if (table.tableType == CatalogTableType.VIEW) { + throw new AnalysisException( + s"Operation not allowed: TRUNCATE TABLE on views: $tableId") + } - if (table.stats.nonEmpty) { - // empty table after truncation - val newStats = CatalogStatistics(sizeInBytes = 0, rowCount = Some(0)) - catalog.alterTableStats(tableIdentifier, Some(newStats)) - } - Seq.empty[Row] - } catch { - // TruncateTableCommand will delete the related directories first, and then refresh the table. - // It will fail when refresh table, because the hudi meta directory(.hoodie) has been deleted at the first step. - // So here ignore this failure, and refresh table later. - case NonFatal(e) => - throw new AnalysisException(s"Exception when attempting to truncate table ${tableIdentifier.quotedString}: " + e) + if (table.partitionColumnNames.isEmpty && partitionSpec.isDefined) { + throw new AnalysisException( + s"Operation not allowed: TRUNCATE TABLE ... PARTITION is not supported " + + s"for tables that are not partitioned: $tableId") } + val basePath = hoodieCatalogTable.tableLocation + val properties = hoodieCatalogTable.tableConfig.getProps + val hadoopConf = sparkSession.sessionState.newHadoopConf() + // If we have not specified the partition, truncate will delete all the data in the table path - // include the hoodie.properties. In this case we should reInit the table. 
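The re-initialization referenced here boils down to recreating the .hoodie metadata from the table properties captured before the delete. A small sketch of that step, using only calls that appear in this hunk (paths and properties are placeholders):

```scala
import java.util.Properties
import org.apache.hadoop.conf.Configuration
import org.apache.hudi.common.table.HoodieTableMetaClient

// Recreate the table metadata under `basePath` after its contents have been wiped.
def reinitTable(props: Properties, basePath: String, hadoopConf: Configuration): Unit =
  HoodieTableMetaClient.withPropertyBuilder()
    .fromProperties(props)
    .initTable(hadoopConf, basePath)
```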
if (partitionSpec.isEmpty) { - val hadoopConf = spark.sessionState.newHadoopConf() + val targetPath = new Path(basePath) + val engineContext = new HoodieSparkEngineContext(sparkSession.sparkContext) + val fs = FSUtils.getFs(basePath, sparkSession.sparkContext.hadoopConfiguration) + FSUtils.deleteDir(engineContext, fs, targetPath, sparkSession.sparkContext.defaultParallelism) + // ReInit hoodie.properties HoodieTableMetaClient.withPropertyBuilder() .fromProperties(properties) .initTable(hadoopConf, hoodieCatalogTable.tableLocation) + } else { + val normalizedSpecs: Seq[Map[String, String]] = Seq(partitionSpec.map { spec => + normalizePartitionSpec( + spec, + hoodieCatalogTable.partitionFields, + hoodieCatalogTable.tableName, + sparkSession.sessionState.conf.resolver) + }.get) + + // drop partitions to lazy clean + val partitionsToDrop = getPartitionPathToDrop(hoodieCatalogTable, normalizedSpecs) + val parameters = buildHoodieDropPartitionsConfig(sparkSession, hoodieCatalogTable, partitionsToDrop) + HoodieSparkSqlWriter.write( + sparkSession.sqlContext, + SaveMode.Append, + parameters, + sparkSession.emptyDataFrame) } // After deleting the data, refresh the table to make sure we don't keep around a stale // file relation in the metastore cache and cached table data in the cache manager. - spark.catalog.refreshTable(hoodieCatalogTable.table.identifier.quotedString) + sparkSession.catalog.refreshTable(table.identifier.quotedString) + logInfo(s"Finish execute truncate table command for $fullTableName") Seq.empty[Row] } } diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodieAnalysis.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodieAnalysis.scala index b140b6767e1e4..dcacbef3a26fa 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodieAnalysis.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodieAnalysis.scala @@ -526,7 +526,7 @@ case class HoodiePostAnalysisRule(sparkSession: SparkSession) extends Rule[Logic // Rewrite the AlterTableRenameCommand to AlterHoodieTableRenameCommand case AlterTableRenameCommand(oldName, newName, isView) if !isView && sparkAdapter.isHoodieTable(oldName, sparkSession) => - new AlterHoodieTableRenameCommand(oldName, newName, isView) + AlterHoodieTableRenameCommand(oldName, newName, isView) // Rewrite the AlterTableChangeColumnCommand to AlterHoodieTableChangeColumnCommand case AlterTableChangeColumnCommand(tableName, columnName, newColumn) if sparkAdapter.isHoodieTable(tableName, sparkSession) => @@ -539,7 +539,7 @@ case class HoodiePostAnalysisRule(sparkSession: SparkSession) extends Rule[Logic // Rewrite TruncateTableCommand to TruncateHoodieTableCommand case TruncateTableCommand(tableName, partitionSpec) if sparkAdapter.isHoodieTable(tableName, sparkSession) => - new TruncateHoodieTableCommand(tableName, partitionSpec) + TruncateHoodieTableCommand(tableName, partitionSpec) case _ => plan } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/another-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/another-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json similarity index 100% rename from hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/another-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json 
rename to hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/another-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/another-input-table-json/part-00001-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/another-input-table-json/part-00001-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json similarity index 100% rename from hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/another-input-table-json/part-00001-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json rename to hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/another-input-table-json/part-00001-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/another-input-table-json/part-00002-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/another-input-table-json/part-00002-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json similarity index 100% rename from hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/another-input-table-json/part-00002-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json rename to hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/another-input-table-json/part-00002-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/another-input-table-json/part-00003-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/another-input-table-json/part-00003-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json similarity index 100% rename from hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/another-input-table-json/part-00003-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json rename to hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/another-input-table-json/part-00003-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/column-stats-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/column-stats-index-table.json similarity index 100% rename from hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/column-stats-index-table.json rename to hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/column-stats-index-table.json diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/input-table-json/part-00000-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/input-table-json/part-00000-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json similarity index 100% rename from hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/input-table-json/part-00000-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json rename to hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/input-table-json/part-00000-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/input-table-json/part-00001-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/input-table-json/part-00001-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json similarity index 100% rename from 
hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/input-table-json/part-00001-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json rename to hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/input-table-json/part-00001-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/input-table-json/part-00002-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/input-table-json/part-00002-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json similarity index 100% rename from hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/input-table-json/part-00002-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json rename to hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/input-table-json/part-00002-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/input-table-json/part-00003-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/input-table-json/part-00003-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json similarity index 100% rename from hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/input-table-json/part-00003-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json rename to hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/input-table-json/part-00003-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/partial-another-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/partial-another-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json new file mode 100644 index 0000000000000..9c0daac405731 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/partial-another-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json @@ -0,0 +1,10 @@ +{"c1":770,"c4":"2021-11-18T23:34:44.201-08:00","c5":78,"c6":"2020-01-15","c7":"Ag==","c8":9} +{"c1":768,"c4":"2021-11-18T23:34:44.201-08:00","c5":78,"c6":"2020-10-13","c7":"AA==","c8":9} +{"c1":431,"c4":"2021-11-18T23:34:44.186-08:00","c5":44,"c6":"2020-03-12","c7":"rw==","c8":9} +{"c1":427,"c4":"2021-11-18T23:34:44.186-08:00","c5":44,"c6":"2020-10-08","c7":"qw==","c8":9} +{"c1":328,"c4":"2021-11-18T23:34:44.181-08:00","c5":34,"c6":"2020-10-21","c7":"SA==","c8":9} +{"c1":320,"c4":"2021-11-18T23:34:44.180-08:00","c5":33,"c6":"2020-02-13","c7":"QA==","c8":9} +{"c1":317,"c4":"2021-11-18T23:34:44.180-08:00","c5":33,"c6":"2020-10-10","c7":"PQ==","c8":9} +{"c1":308,"c4":"2021-11-18T23:34:44.180-08:00","c5":32,"c6":"2020-01-01","c7":"NA==","c8":9} +{"c1":304,"c4":"2021-11-18T23:34:44.179-08:00","c5":32,"c6":"2020-08-25","c7":"MA==","c8":9} +{"c1":300,"c4":"2021-11-18T23:34:44.179-08:00","c5":31,"c6":"2020-04-21","c7":"LA==","c8":9} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/partial-another-input-table-json/part-00001-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/partial-another-input-table-json/part-00001-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json new file mode 100644 index 0000000000000..d19386382bede --- /dev/null +++ 
b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/partial-another-input-table-json/part-00001-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json @@ -0,0 +1,10 @@ +{"c1":719,"c4":"2021-11-18T23:34:44.199-08:00","c5":73,"c6":"2020-05-20","c7":"zw==","c8":9} +{"c1":715,"c4":"2021-11-18T23:34:44.199-08:00","c5":73,"c6":"2020-01-16","c7":"yw==","c8":9} +{"c1":579,"c4":"2021-11-18T23:34:44.193-08:00","c5":59,"c6":"2020-08-20","c7":"Qw==","c8":9} +{"c1":568,"c4":"2021-11-18T23:34:44.193-08:00","c5":58,"c6":"2020-08-09","c7":"OA==","c8":9} +{"c1":367,"c4":"2021-11-18T23:34:44.183-08:00","c5":38,"c6":"2020-05-04","c7":"bw==","c8":9} +{"c1":364,"c4":"2021-11-18T23:34:44.183-08:00","c5":38,"c6":"2020-02-01","c7":"bA==","c8":9} +{"c1":250,"c4":"2021-11-18T23:34:44.176-08:00","c5":26,"c6":"2020-09-27","c7":"+g==","c8":9} +{"c1":249,"c4":"2021-11-18T23:34:44.176-08:00","c5":26,"c6":"2020-08-26","c7":"+Q==","c8":9} +{"c1":246,"c4":"2021-11-18T23:34:44.176-08:00","c5":26,"c6":"2020-05-23","c7":"9g==","c8":9} +{"c1":125,"c4":"2021-11-18T23:34:44.169-08:00","c5":14,"c6":"2020-05-14","c7":"fQ==","c8":9} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/partial-another-input-table-json/part-00002-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/partial-another-input-table-json/part-00002-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json new file mode 100644 index 0000000000000..602dbe87b1286 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/partial-another-input-table-json/part-00002-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json @@ -0,0 +1,10 @@ +{"c1":486,"c4":"2021-11-18T23:34:44.189-08:00","c5":50,"c6":"2020-03-11","c7":"5g==","c8":9} +{"c1":483,"c4":"2021-11-18T23:34:44.189-08:00","c5":49,"c6":"2020-11-08","c7":"4w==","c8":9} +{"c1":224,"c4":"2021-11-18T23:34:44.175-08:00","c5":24,"c6":"2020-05-01","c7":"4A==","c8":9} +{"c1":118,"c4":"2021-11-18T23:34:44.168-08:00","c5":13,"c6":"2020-09-07","c7":"dg==","c8":9} +{"c1":111,"c4":"2021-11-18T23:34:44.168-08:00","c5":12,"c6":"2020-02-28","c7":"bw==","c8":9} +{"c1":79,"c4":"2021-11-18T23:34:44.166-08:00","c5":9,"c6":"2020-03-24","c7":"Tw==","c8":9} +{"c1":77,"c4":"2021-11-18T23:34:44.166-08:00","c5":9,"c6":"2020-01-22","c7":"TQ==","c8":9} +{"c1":76,"c4":"2021-11-18T23:34:44.166-08:00","c5":9,"c6":"2020-11-21","c7":"TA==","c8":9} +{"c1":60,"c4":"2021-11-18T23:34:44.164-08:00","c5":7,"c6":"2020-06-05","c7":"PA==","c8":9} +{"c1":59,"c4":"2021-11-18T23:34:44.164-08:00","c5":7,"c6":"2020-05-04","c7":"Ow==","c8":9} diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/partial-another-input-table-json/part-00003-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/partial-another-input-table-json/part-00003-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json new file mode 100644 index 0000000000000..6232e862f92c4 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/partial-another-input-table-json/part-00003-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json @@ -0,0 +1,10 @@ +{"c1":272,"c4":"2021-11-18T23:34:44.178-08:00","c5":28,"c6":"2020-09-21","c7":"EA==","c8":9} +{"c1":258,"c4":"2021-11-18T23:34:44.177-08:00","c5":27,"c6":"2020-06-07","c7":"Ag==","c8":9} +{"c1":240,"c4":"2021-11-18T23:34:44.176-08:00","c5":25,"c6":"2020-10-17","c7":"8A==","c8":9} 
+{"c1":236,"c4":"2021-11-18T23:34:44.176-08:00","c5":25,"c6":"2020-06-13","c7":"7A==","c8":9} +{"c1":137,"c4":"2021-11-18T23:34:44.170-08:00","c5":15,"c6":"2020-06-26","c7":"iQ==","c8":9} +{"c1":134,"c4":"2021-11-18T23:34:44.170-08:00","c5":15,"c6":"2020-03-23","c7":"hg==","c8":9} +{"c1":131,"c4":"2021-11-18T23:34:44.169-08:00","c5":14,"c6":"2020-11-20","c7":"gw==","c8":9} +{"c1":129,"c4":"2021-11-18T23:34:44.169-08:00","c5":14,"c6":"2020-09-18","c7":"gQ==","c8":9} +{"c1":24,"c4":"2021-11-18T23:34:44.161-08:00","c5":4,"c6":"2020-03-25","c7":"GA==","c8":9} +{"c1":8,"c4":"2021-11-18T23:34:44.159-08:00","c5":2,"c6":"2020-09-09","c7":"CA==","c8":9} diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/partial-column-stats-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/partial-column-stats-index-table.json new file mode 100644 index 0000000000000..8405cdf91fc9b --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/partial-column-stats-index-table.json @@ -0,0 +1,4 @@ +{"c1_maxValue":769,"c1_minValue":309,"c1_nullCount":0,"valueCount":9} +{"c1_maxValue":932,"c1_minValue":0,"c1_nullCount":0,"valueCount":8} +{"c1_maxValue":943,"c1_minValue":89,"c1_nullCount":0,"valueCount":10} +{"c1_maxValue":959,"c1_minValue":74,"c1_nullCount":0,"valueCount":13} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/updated-column-stats-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/updated-column-stats-index-table.json similarity index 100% rename from hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/updated-column-stats-index-table.json rename to hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/updated-column-stats-index-table.json diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/updated-partial-column-stats-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/updated-partial-column-stats-index-table.json new file mode 100644 index 0000000000000..8552fd3592cda --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/colstats/updated-partial-column-stats-index-table.json @@ -0,0 +1,8 @@ +{"c1_maxValue":568,"c1_minValue":8,"c1_nullCount":0,"c2_nullCount":15,"c3_nullCount":15,"valueCount":15} +{"c1_maxValue":715,"c1_minValue":76,"c1_nullCount":0,"c2_nullCount":12,"c3_nullCount":12,"valueCount":12} +{"c1_maxValue":768,"c1_minValue":59,"c1_nullCount":0,"c2_nullCount":7,"c3_nullCount":7,"valueCount":7} +{"c1_maxValue":769,"c1_minValue":309,"c1_nullCount":0,"c2_maxValue":" 769sdc","c2_minValue":" 309sdc","c2_nullCount":0,"c3_maxValue":919.769,"c3_minValue":76.430,"c3_nullCount":0,"valueCount":9} +{"c1_maxValue":770,"c1_minValue":129,"c1_nullCount":0,"c2_nullCount":6,"c3_nullCount":6,"valueCount":6} +{"c1_maxValue":932,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 932sdc","c2_minValue":" 0sdc","c2_nullCount":0,"c3_maxValue":994.355,"c3_minValue":19.000,"c3_nullCount":0,"valueCount":8} +{"c1_maxValue":943,"c1_minValue":89,"c1_nullCount":0,"c2_maxValue":" 943sdc","c2_minValue":" 200sdc","c2_nullCount":0,"c3_maxValue":854.690,"c3_minValue":100.556,"c3_nullCount":0,"valueCount":10} +{"c1_maxValue":959,"c1_minValue":74,"c1_nullCount":0,"c2_maxValue":" 959sdc","c2_minValue":" 181sdc","c2_nullCount":0,"c3_maxValue":980.213,"c3_minValue":38.740,"c3_nullCount":0,"valueCount":13} \ No newline at end of file diff --git 
a/hudi-spark-datasource/hudi-spark/src/test/resources/log4j-surefire.properties b/hudi-spark-datasource/hudi-spark/src/test/resources/log4j-surefire.properties index 32af462093ae5..14bbb089724c8 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/resources/log4j-surefire.properties +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/log4j-surefire.properties @@ -20,9 +20,9 @@ log4j.logger.org.apache=INFO log4j.logger.org.apache.hudi=DEBUG log4j.logger.org.apache.hadoop.hbase=ERROR -# A1 is set to be a ConsoleAppender. +# CONSOLE is set to be a ConsoleAppender. log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. +# CONSOLE uses PatternLayout. log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/SparkDatasetMixin.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/SparkDatasetMixin.scala new file mode 100644 index 0000000000000..ee733a86a697e --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/SparkDatasetMixin.scala @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi + +import org.apache.avro.generic.GenericRecord +import org.apache.hudi.common.model.{HoodieRecord, HoodieRecordPayload} +import org.apache.hudi.common.testutils.HoodieTestDataGenerator +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession + +import scala.collection.JavaConversions.collectionAsScalaIterable + +trait SparkDatasetMixin { + + def toDataset(spark: SparkSession, records: java.util.List[HoodieRecord[_]]) = { + val avroRecords = records.map( + _.getData + .asInstanceOf[HoodieRecordPayload[_]] + .getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA) + .get + .asInstanceOf[GenericRecord] + ) + .toSeq + val rdd: RDD[GenericRecord] = spark.sparkContext.parallelize(avroRecords) + AvroConversionUtils.createDataFrame(rdd, HoodieTestDataGenerator.AVRO_SCHEMA.toString, spark) + } + +} diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala index feed6fd334062..1d4dbfb1eace7 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala @@ -25,10 +25,9 @@ import org.apache.hudi.client.HoodieJavaWriteClient import org.apache.hudi.client.common.HoodieJavaEngineContext import org.apache.hudi.common.config.HoodieMetadataConfig import org.apache.hudi.common.engine.EngineType -import org.apache.hudi.common.fs.FSUtils -import org.apache.hudi.common.model.{HoodieRecord, HoodieTableQueryType, HoodieTableType} -import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient} +import org.apache.hudi.common.model.{HoodieRecord, HoodieTableType} import org.apache.hudi.common.table.view.HoodieTableFileSystemView +import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient} import org.apache.hudi.common.testutils.HoodieTestTable.makeNewCommitTime import org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings import org.apache.hudi.common.testutils.{HoodieTestDataGenerator, HoodieTestUtils} @@ -38,17 +37,15 @@ import org.apache.hudi.config.{HoodieStorageConfig, HoodieWriteConfig} import org.apache.hudi.keygen.ComplexKeyGenerator import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator.TimestampType import org.apache.hudi.keygen.constant.KeyGeneratorOptions.Config -import org.apache.hudi.metadata.{HoodieTableMetadata, MetadataPartitionType} -import org.apache.hudi.testutils.{HoodieClientTestBase, SparkClientFunctionalTestHarness} +import org.apache.hudi.testutils.HoodieClientTestBase import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, EqualTo, GreaterThanOrEqual, LessThan, Literal} import org.apache.spark.sql.execution.datasources.{NoopCache, PartitionDirectory} import org.apache.spark.sql.functions.{lit, struct} import org.apache.spark.sql.types.{IntegerType, StringType} import org.apache.spark.sql.{DataFrameWriter, Row, SaveMode, SparkSession} import org.junit.jupiter.api.Assertions.assertEquals -import org.junit.jupiter.api.{BeforeEach, Tag, Test} +import org.junit.jupiter.api.{BeforeEach, Test} import org.junit.jupiter.params.ParameterizedTest import org.junit.jupiter.params.provider.{Arguments, CsvSource, MethodSource, ValueSource} @@ -343,16 +340,19 @@ class TestHoodieFileIndex extends 
HoodieClientTestBase { import _spark.implicits._ val inputDF = tuples.toDF("id", "inv_id", "str", "rand") + val writeMetadataOpts = Map( + HoodieMetadataConfig.ENABLE.key -> "true", + HoodieMetadataConfig.ENABLE_METADATA_INDEX_COLUMN_STATS.key -> "true" + ) + val opts = Map( "hoodie.insert.shuffle.parallelism" -> "4", "hoodie.upsert.shuffle.parallelism" -> "4", HoodieWriteConfig.TBL_NAME.key -> "hoodie_test", RECORDKEY_FIELD.key -> "id", PRECOMBINE_FIELD.key -> "id", - HoodieMetadataConfig.ENABLE.key -> "true", - HoodieMetadataConfig.ENABLE_METADATA_INDEX_COLUMN_STATS.key -> "true", HoodieTableConfig.POPULATE_META_FIELDS.key -> "true" - ) + ) ++ writeMetadataOpts // If there are any failures in the Data Skipping flow, test should fail spark.sqlContext.setConf(DataSkippingFailureMode.configName, DataSkippingFailureMode.Strict.value); @@ -368,26 +368,46 @@ class TestHoodieFileIndex extends HoodieClientTestBase { metaClient = HoodieTableMetaClient.reload(metaClient) - val props = Map[String, String]( - "path" -> basePath, - QUERY_TYPE.key -> QUERY_TYPE_SNAPSHOT_OPT_VAL, - DataSourceReadOptions.ENABLE_DATA_SKIPPING.key -> "true", - // NOTE: Metadata Table has to be enabled on the read path as well - HoodieMetadataConfig.ENABLE.key -> "true", - HoodieMetadataConfig.ENABLE_METADATA_INDEX_COLUMN_STATS.key -> "true" - ) - - val fileIndex = HoodieFileIndex(spark, metaClient, Option.empty, props, NoopCache) - - val allFilesPartitions = fileIndex.listFiles(Seq(), Seq()) - assertEquals(10, allFilesPartitions.head.files.length) - - // We're selecting a single file that contains "id" == 1 row, which there should be - // strictly 1. Given that 1 is minimal possible value, Data Skipping should be able to - // truncate search space to just a single file - val dataFilter = EqualTo(AttributeReference("id", IntegerType, nullable = false)(), Literal(1)) - val filteredPartitions = fileIndex.listFiles(Seq(), Seq(dataFilter)) - assertEquals(1, filteredPartitions.head.files.length) + case class TestCase(enableMetadata: Boolean, + enableColumnStats: Boolean, + enableDataSkipping: Boolean) + + val testCases: Seq[TestCase] = + TestCase(enableMetadata = false, enableColumnStats = false, enableDataSkipping = false) :: + TestCase(enableMetadata = false, enableColumnStats = false, enableDataSkipping = true) :: + TestCase(enableMetadata = true, enableColumnStats = false, enableDataSkipping = true) :: + TestCase(enableMetadata = false, enableColumnStats = true, enableDataSkipping = true) :: + TestCase(enableMetadata = true, enableColumnStats = true, enableDataSkipping = true) :: + Nil + + for (testCase <- testCases) { + val readMetadataOpts = Map( + // NOTE: Metadata Table has to be enabled on the read path as well + HoodieMetadataConfig.ENABLE.key -> testCase.enableMetadata.toString, + HoodieMetadataConfig.ENABLE_METADATA_INDEX_COLUMN_STATS.key -> testCase.enableColumnStats.toString, + HoodieTableConfig.POPULATE_META_FIELDS.key -> "true" + ) + + val props = Map[String, String]( + "path" -> basePath, + QUERY_TYPE.key -> QUERY_TYPE_SNAPSHOT_OPT_VAL, + DataSourceReadOptions.ENABLE_DATA_SKIPPING.key -> testCase.enableDataSkipping.toString + ) ++ readMetadataOpts + + val fileIndex = HoodieFileIndex(spark, metaClient, Option.empty, props, NoopCache) + + val allFilesPartitions = fileIndex.listFiles(Seq(), Seq()) + assertEquals(10, allFilesPartitions.head.files.length) + + if (testCase.enableDataSkipping && testCase.enableMetadata) { + // We're selecting a single file that contains "id" == 1 row, which there should be + // 
strictly 1. Given that 1 is minimal possible value, Data Skipping should be able to + // truncate search space to just a single file + val dataFilter = EqualTo(AttributeReference("id", IntegerType, nullable = false)(), Literal(1)) + val filteredPartitions = fileIndex.listFiles(Seq(), Seq(dataFilter)) + assertEquals(1, filteredPartitions.head.files.length) + } + } } private def attribute(partition: String): AttributeReference = { @@ -411,6 +431,7 @@ class TestHoodieFileIndex extends HoodieClientTestBase { } object TestHoodieFileIndex { + def keyGeneratorParameters(): java.util.stream.Stream[Arguments] = { java.util.stream.Stream.of( Arguments.arguments(null.asInstanceOf[String]), diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala index 000004ace9ad4..088ec1faabf73 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala @@ -37,7 +37,7 @@ import org.joda.time.DateTime import org.joda.time.format.DateTimeFormat import org.junit.jupiter.api.Assertions.{assertEquals, assertThrows, assertTrue, fail} import org.junit.jupiter.api.function.Executable -import org.junit.jupiter.api.{AfterEach, BeforeEach, Test} +import org.junit.jupiter.api.{AfterEach, BeforeEach, Disabled, Test} import org.junit.jupiter.params.ParameterizedTest import org.junit.jupiter.params.provider.{CsvSource, ValueSource} @@ -96,6 +96,26 @@ class TestCOWDataSource extends HoodieClientTestBase { assertTrue(HoodieDataSourceHelpers.hasNewCommits(fs, basePath, "000")) } + @Test def testNoPrecombine() { + // Insert Operation + val records = recordsToStrings(dataGen.generateInserts("000", 100)).toList + val inputDF = spark.read.json(spark.sparkContext.parallelize(records, 2)) + + val commonOptsNoPreCombine = Map( + "hoodie.insert.shuffle.parallelism" -> "4", + "hoodie.upsert.shuffle.parallelism" -> "4", + DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key", + DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition", + HoodieWriteConfig.TBL_NAME.key -> "hoodie_test" + ) + inputDF.write.format("hudi") + .options(commonOptsNoPreCombine) + .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) + .mode(SaveMode.Overwrite) + .save(basePath) + + spark.read.format("org.apache.hudi").load(basePath).count() + } @Test def testHoodieIsDeletedNonBooleanField() { // Insert Operation @@ -747,8 +767,9 @@ class TestCOWDataSource extends HoodieClientTestBase { assertEquals(resultSchema, schema1) } - @ParameterizedTest @ValueSource(booleans = Array(true, false)) - def testCopyOnWriteWithDropPartitionColumns(enableDropPartitionColumns: Boolean) { + @ParameterizedTest + @ValueSource(booleans = Array(true, false)) + def testCopyOnWriteWithDroppedPartitionColumns(enableDropPartitionColumns: Boolean) { val records1 = recordsToStrings(dataGen.generateInsertsContainsAllPartitions("000", 100)).toList val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2)) inputDF1.write.format("org.apache.hudi") @@ -897,8 +918,9 @@ class TestCOWDataSource extends HoodieClientTestBase { readResult.sort("_row_key").select("shortDecimal").collect().map(_.getDecimal(0).toPlainString).mkString(",")) } - @Test - def testHoodieBaseFileOnlyViewRelation(): Unit = { + @ParameterizedTest + 
@ValueSource(booleans = Array(true, false)) + def testPartitionColumnsProperHandling(useGlobbing: Boolean): Unit = { val _spark = spark import _spark.implicits._ @@ -918,26 +940,56 @@ class TestCOWDataSource extends HoodieClientTestBase { .option(DataSourceWriteOptions.PRECOMBINE_FIELD.key, "ts") .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key, "org.apache.hudi.keygen.TimestampBasedKeyGenerator") .option(Config.TIMESTAMP_TYPE_FIELD_PROP, "DATE_STRING") + .option(Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP, "yyyy-MM-dd") .option(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, "yyyy/MM/dd") .option(Config.TIMESTAMP_TIMEZONE_FORMAT_PROP, "GMT+8:00") - .option(Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP, "yyyy-MM-dd") .mode(org.apache.spark.sql.SaveMode.Append) .save(basePath) - val res = spark.read.format("hudi").load(basePath) + // NOTE: We're testing here that both paths are appropriately handling + // partition values, regardless of whether we're reading the table + // t/h a globbed path or not + val path = if (useGlobbing) { + s"$basePath/*/*/*/*" + } else { + basePath + } + + // Case #1: Partition columns are read from the data file + val firstDF = spark.read.format("hudi").load(path) - assert(res.count() == 2) + assert(firstDF.count() == 2) // data_date is the partition field. Persist to the parquet file using the origin values, and read it. - assertTrue( - res.select("data_date").map(_.get(0).toString).collect().sorted.sameElements( - Array("2018-09-23", "2018-09-24") - ) + assertEquals( + Seq("2018-09-23", "2018-09-24"), + firstDF.select("data_date").map(_.get(0).toString).collect().sorted.toSeq ) - assertTrue( - res.select("_hoodie_partition_path").map(_.get(0).toString).collect().sorted.sameElements( - Array("2018/09/23", "2018/09/24") - ) + assertEquals( + Seq("2018/09/23", "2018/09/24"), + firstDF.select("_hoodie_partition_path").map(_.get(0).toString).collect().sorted.toSeq ) + + // Case #2: Partition columns are extracted from the partition path + // + // NOTE: This case is only relevant when globbing is NOT used, since when globbing is used Spark + // won't be able to infer partitioning properly + if (!useGlobbing) { + val secondDF = spark.read.format("hudi") + .option(DataSourceReadOptions.EXTRACT_PARTITION_VALUES_FROM_PARTITION_PATH.key, "true") + .load(path) + + assert(secondDF.count() == 2) + + // data_date is the partition field. Persist to the parquet file using the origin values, and read it. 
+ assertEquals( + Seq("2018/09/23", "2018/09/24"), + secondDF.select("data_date").map(_.get(0).toString).collect().sorted.toSeq + ) + assertEquals( + Seq("2018/09/23", "2018/09/24"), + secondDF.select("_hoodie_partition_path").map(_.get(0).toString).collect().sorted.toSeq + ) + } } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSourceStorage.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSourceStorage.scala index e7daf08d1193c..48bb46f81b1b0 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSourceStorage.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSourceStorage.scala @@ -33,7 +33,7 @@ import org.apache.hudi.{DataSourceReadOptions, DataSourceWriteOptions, HoodieDat import org.apache.spark.sql._ import org.apache.spark.sql.functions.{col, lit} import org.junit.jupiter.api.Assertions.{assertEquals, assertFalse, assertTrue} -import org.junit.jupiter.api.Tag +import org.junit.jupiter.api.{Disabled, Tag} import org.junit.jupiter.params.ParameterizedTest import org.junit.jupiter.params.provider.{CsvSource, ValueSource} diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala index e3cde53951077..75d3ce0b71287 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala @@ -19,7 +19,7 @@ package org.apache.hudi.functional import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path} +import org.apache.hadoop.fs.{LocatedFileStatus, Path} import org.apache.hudi.ColumnStatsIndexSupport.composeIndexSchema import org.apache.hudi.DataSourceWriteOptions.{PRECOMBINE_FIELD, RECORDKEY_FIELD} import org.apache.hudi.HoodieConversionUtils.toProperties @@ -27,6 +27,7 @@ import org.apache.hudi.common.config.HoodieMetadataConfig import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient} import org.apache.hudi.common.util.ParquetUtils import org.apache.hudi.config.{HoodieStorageConfig, HoodieWriteConfig} +import org.apache.hudi.functional.TestColumnStatsIndex.ColumnStatsTestCase import org.apache.hudi.testutils.HoodieClientTestBase import org.apache.hudi.{ColumnStatsIndexSupport, DataSourceWriteOptions} import org.apache.spark.sql._ @@ -35,7 +36,7 @@ import org.apache.spark.sql.types._ import org.junit.jupiter.api.Assertions.{assertEquals, assertNotNull, assertTrue} import org.junit.jupiter.api._ import org.junit.jupiter.params.ParameterizedTest -import org.junit.jupiter.params.provider.ValueSource +import org.junit.jupiter.params.provider.{Arguments, MethodSource} import java.math.BigInteger import java.sql.{Date, Timestamp} @@ -62,6 +63,10 @@ class TestColumnStatsIndex extends HoodieClientTestBase with ColumnStatsIndexSup initPath() initSparkContexts() initFileSystem() + + setTableName("hoodie_test") + initMetaClient() + spark = sqlContext.sparkSession } @@ -72,24 +77,27 @@ class TestColumnStatsIndex extends HoodieClientTestBase with ColumnStatsIndexSup } @ParameterizedTest - @ValueSource(booleans = Array(true, false)) - def testMetadataColumnStatsIndex(forceFullLogScan: Boolean): Unit = { + 
@MethodSource(Array("testMetadataColumnStatsIndexParams")) + def testMetadataColumnStatsIndex(testCase: ColumnStatsTestCase): Unit = { + val metadataOpts = Map( + HoodieMetadataConfig.ENABLE.key -> "true", + HoodieMetadataConfig.ENABLE_METADATA_INDEX_COLUMN_STATS.key -> "true" + ) + val opts = Map( "hoodie.insert.shuffle.parallelism" -> "4", "hoodie.upsert.shuffle.parallelism" -> "4", HoodieWriteConfig.TBL_NAME.key -> "hoodie_test", RECORDKEY_FIELD.key -> "c1", PRECOMBINE_FIELD.key -> "c1", - HoodieMetadataConfig.ENABLE.key -> "true", - HoodieMetadataConfig.ENABLE_METADATA_INDEX_COLUMN_STATS.key -> "true", - HoodieMetadataConfig.ENABLE_FULL_SCAN_LOG_FILES.key -> forceFullLogScan.toString, + // NOTE: Currently only this setting is used like following by different MT partitions: + // - Files: using it + // - Column Stats: NOT using it (defaults to doing "point-lookups") + HoodieMetadataConfig.ENABLE_FULL_SCAN_LOG_FILES.key -> testCase.forceFullLogScan.toString, HoodieTableConfig.POPULATE_META_FIELDS.key -> "true" - ) + ) ++ metadataOpts - setTableName("hoodie_test") - initMetaClient() - - val sourceJSONTablePath = getClass.getClassLoader.getResource("index/zorder/input-table-json").toString + val sourceJSONTablePath = getClass.getClassLoader.getResource("index/colstats/input-table-json").toString // NOTE: Schema here is provided for validation that the input date is in the appropriate format val inputDF = spark.read.schema(sourceTableSchema).json(sourceJSONTablePath) @@ -108,10 +116,17 @@ class TestColumnStatsIndex extends HoodieClientTestBase with ColumnStatsIndexSup metaClient = HoodieTableMetaClient.reload(metaClient) val metadataConfig = HoodieMetadataConfig.newBuilder() - .fromProperties(toProperties(opts)) + .fromProperties(toProperties(metadataOpts)) .build() - val colStatsDF = readColumnStatsIndex(spark, basePath, metadataConfig, sourceTableSchema.fieldNames) + val requestedColumns: Seq[String] = { + // Providing empty seq of columns to [[readColumnStatsIndex]] will lead to the whole + // MT to be read, and subsequently filtered + if (testCase.readFullMetadataTable) Seq.empty + else sourceTableSchema.fieldNames + } + + val colStatsDF = readColumnStatsIndex(spark, basePath, metadataConfig, requestedColumns) val transposedColStatsDF = transposeColumnStatsIndex(spark, colStatsDF, sourceTableSchema.fieldNames, sourceTableSchema) val expectedColStatsSchema = composeIndexSchema(sourceTableSchema.fieldNames, sourceTableSchema) @@ -120,7 +135,7 @@ class TestColumnStatsIndex extends HoodieClientTestBase with ColumnStatsIndexSup val expectedColStatsIndexTableDf = spark.read .schema(expectedColStatsSchema) - .json(getClass.getClassLoader.getResource("index/zorder/column-stats-index-table.json").toString) + .json(getClass.getClassLoader.getResource("index/colstats/column-stats-index-table.json").toString) assertEquals(expectedColStatsIndexTableDf.schema, transposedColStatsDF.schema) // NOTE: We have to drop the `fileName` column as it contains semi-random components @@ -135,7 +150,7 @@ class TestColumnStatsIndex extends HoodieClientTestBase with ColumnStatsIndexSup assertEquals(asJson(sort(manualColStatsTableDF)), asJson(sort(transposedColStatsDF))) // do an upsert and validate - val updateJSONTablePath = getClass.getClassLoader.getResource("index/zorder/another-input-table-json").toString + val updateJSONTablePath = getClass.getClassLoader.getResource("index/colstats/another-input-table-json").toString val updateDF = spark.read .schema(sourceTableSchema) .json(updateJSONTablePath) @@ -151,13 
+166,13 @@ class TestColumnStatsIndex extends HoodieClientTestBase with ColumnStatsIndexSup metaClient = HoodieTableMetaClient.reload(metaClient) - val updatedColStatsDF = readColumnStatsIndex(spark, basePath, metadataConfig, sourceTableSchema.fieldNames) + val updatedColStatsDF = readColumnStatsIndex(spark, basePath, metadataConfig, requestedColumns) val transposedUpdatedColStatsDF = transposeColumnStatsIndex(spark, updatedColStatsDF, sourceTableSchema.fieldNames, sourceTableSchema) val expectedColStatsIndexUpdatedDF = spark.read .schema(expectedColStatsSchema) - .json(getClass.getClassLoader.getResource("index/zorder/updated-column-stats-index-table.json").toString) + .json(getClass.getClassLoader.getResource("index/colstats/updated-column-stats-index-table.json").toString) assertEquals(expectedColStatsIndexUpdatedDF.schema, transposedUpdatedColStatsDF.schema) assertEquals(asJson(sort(expectedColStatsIndexUpdatedDF)), asJson(sort(transposedUpdatedColStatsDF.drop("fileName")))) @@ -169,6 +184,153 @@ class TestColumnStatsIndex extends HoodieClientTestBase with ColumnStatsIndexSup assertEquals(asJson(sort(manualUpdatedColStatsTableDF)), asJson(sort(transposedUpdatedColStatsDF))) } + @Test + def testMetadataColumnStatsIndexPartialProjection(): Unit = { + val targetColumnsToIndex = Seq("c1", "c2", "c3") + + val metadataOpts = Map( + HoodieMetadataConfig.ENABLE.key -> "true", + HoodieMetadataConfig.ENABLE_METADATA_INDEX_COLUMN_STATS.key -> "true", + HoodieMetadataConfig.COLUMN_STATS_INDEX_FOR_COLUMNS.key -> targetColumnsToIndex.mkString(",") + ) + + val opts = Map( + "hoodie.insert.shuffle.parallelism" -> "4", + "hoodie.upsert.shuffle.parallelism" -> "4", + HoodieWriteConfig.TBL_NAME.key -> "hoodie_test", + RECORDKEY_FIELD.key -> "c1", + PRECOMBINE_FIELD.key -> "c1", + HoodieTableConfig.POPULATE_META_FIELDS.key -> "true" + ) ++ metadataOpts + + val sourceJSONTablePath = getClass.getClassLoader.getResource("index/colstats/input-table-json").toString + + // NOTE: Schema here is provided for validation that the input date is in the appropriate format + val inputDF = spark.read.schema(sourceTableSchema).json(sourceJSONTablePath) + + inputDF + .sort("c1") + .repartition(4, new Column("c1")) + .write + .format("hudi") + .options(opts) + .option(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE.key, 10 * 1024) + .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) + .mode(SaveMode.Overwrite) + .save(basePath) + + metaClient = HoodieTableMetaClient.reload(metaClient) + + val metadataConfig = HoodieMetadataConfig.newBuilder() + .fromProperties(toProperties(metadataOpts)) + .build() + + //////////////////////////////////////////////////////////////////////// + // Case #1: Empty CSI projection + // Projection is requested for columns which are NOT indexed + // by the CSI + //////////////////////////////////////////////////////////////////////// + + { + // These are NOT indexed + val requestedColumns = Seq("c4") + + val emptyColStatsDF = readColumnStatsIndex(spark, basePath, metadataConfig, requestedColumns) + val emptyTransposedColStatsDF = transposeColumnStatsIndex(spark, emptyColStatsDF, requestedColumns, sourceTableSchema) + + assertEquals(0, emptyColStatsDF.collect().length) + assertEquals(0, emptyTransposedColStatsDF.collect().length) + } + + //////////////////////////////////////////////////////////////////////// + // Case #2: Partial CSI projection + // Projection is requested for set of columns some of which are + // NOT indexed by the CSI + 
//////////////////////////////////////////////////////////////////////// + + { + // We have to include "c1", since we sort the expected outputs by this column + val requestedColumns = Seq("c1", "c4") + + val partialColStatsDF = readColumnStatsIndex(spark, basePath, metadataConfig, requestedColumns) + val partialTransposedColStatsDF = transposeColumnStatsIndex(spark, partialColStatsDF, requestedColumns, sourceTableSchema) + + val targetIndexedColumns = targetColumnsToIndex.intersect(requestedColumns) + val expectedColStatsSchema = composeIndexSchema(targetIndexedColumns, sourceTableSchema) + + // Match against expected column stats table + val expectedColStatsIndexTableDf = + spark.read + .schema(expectedColStatsSchema) + .json(getClass.getClassLoader.getResource("index/colstats/partial-column-stats-index-table.json").toString) + + assertEquals(expectedColStatsIndexTableDf.schema, partialTransposedColStatsDF.schema) + // NOTE: We have to drop the `fileName` column as it contains semi-random components + // that we can't control in this test. Nevertheless, since we manually verify composition of the + // ColStats Index by reading Parquet footers from individual Parquet files, this is not an issue + assertEquals(asJson(sort(expectedColStatsIndexTableDf)), asJson(sort(partialTransposedColStatsDF.drop("fileName")))) + + // Collect Column Stats manually (reading individual Parquet files) + val manualColStatsTableDF = + buildColumnStatsTableManually(basePath, targetIndexedColumns, expectedColStatsSchema) + + assertEquals(asJson(sort(manualColStatsTableDF)), asJson(sort(partialTransposedColStatsDF))) + } + + //////////////////////////////////////////////////////////////////////// + // Case #3: Aligned CSI projection + // Projection is requested for set of columns some of which are + // indexed only for subset of files + //////////////////////////////////////////////////////////////////////// + + { + // NOTE: The update we're writing is intentionally omitting some of the columns + // present in an earlier source + val missingCols = Seq("c2", "c3") + val partialSourceTableSchema = StructType(sourceTableSchema.fields.filterNot(f => missingCols.contains(f.name))) + + val updateJSONTablePath = getClass.getClassLoader.getResource("index/colstats/partial-another-input-table-json").toString + val updateDF = spark.read + .schema(partialSourceTableSchema) + .json(updateJSONTablePath) + + updateDF.repartition(4) + .write + .format("hudi") + .options(opts) + .option(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE.key, 10 * 1024) + .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) + .mode(SaveMode.Append) + .save(basePath) + + metaClient = HoodieTableMetaClient.reload(metaClient) + + val requestedColumns = sourceTableSchema.fieldNames + + // Nevertheless, the last update was written with a new schema (that is a subset of the original table schema), + // we should be able to read CSI, which will be properly padded (with nulls) after transposition + val updatedColStatsDF = readColumnStatsIndex(spark, basePath, metadataConfig, requestedColumns) + val transposedUpdatedColStatsDF = transposeColumnStatsIndex(spark, updatedColStatsDF, requestedColumns, sourceTableSchema) + + val targetIndexedColumns = targetColumnsToIndex.intersect(requestedColumns) + val expectedColStatsSchema = composeIndexSchema(targetIndexedColumns, sourceTableSchema) + + val expectedColStatsIndexUpdatedDF = + spark.read + .schema(expectedColStatsSchema) + 
.json(getClass.getClassLoader.getResource("index/colstats/updated-partial-column-stats-index-table.json").toString) + + assertEquals(expectedColStatsIndexUpdatedDF.schema, transposedUpdatedColStatsDF.schema) + assertEquals(asJson(sort(expectedColStatsIndexUpdatedDF)), asJson(sort(transposedUpdatedColStatsDF.drop("fileName")))) + + // Collect Column Stats manually (reading individual Parquet files) + val manualUpdatedColStatsTableDF = + buildColumnStatsTableManually(basePath, targetIndexedColumns, expectedColStatsSchema) + + assertEquals(asJson(sort(manualUpdatedColStatsTableDF)), asJson(sort(transposedUpdatedColStatsDF))) + } + } + @Test def testParquetMetadataRangeExtraction(): Unit = { val df = generateRandomDataFrame(spark) @@ -243,26 +405,6 @@ class TestColumnStatsIndex extends HoodieClientTestBase with ColumnStatsIndexSup ) } - def bootstrapParquetInputTableFromJSON(sourceJSONTablePath: String, targetParquetTablePath: String): Unit = { - val jsonInputDF = - // NOTE: Schema here is provided for validation that the input date is in the appropriate format - spark.read - .schema(sourceTableSchema) - .json(sourceJSONTablePath) - - jsonInputDF - .sort("c1") - .repartition(4, new Column("c1")) - .write - .format("parquet") - .mode("overwrite") - .save(targetParquetTablePath) - - val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration) - // Have to cleanup additional artefacts of Spark write - fs.delete(new Path(targetParquetTablePath, "_SUCCESS"), false) - } - private def generateRandomDataFrame(spark: SparkSession): DataFrame = { val sourceTableSchema = new StructType() @@ -316,3 +458,14 @@ class TestColumnStatsIndex extends HoodieClientTestBase with ColumnStatsIndexSup } } + +object TestColumnStatsIndex { + + case class ColumnStatsTestCase(forceFullLogScan: Boolean, readFullMetadataTable: Boolean) + + def testMetadataColumnStatsIndexParams: java.util.stream.Stream[Arguments] = + java.util.stream.Stream.of( + Arguments.arguments(ColumnStatsTestCase(forceFullLogScan = false, readFullMetadataTable = false)), + Arguments.arguments(ColumnStatsTestCase(forceFullLogScan = true, readFullMetadataTable = true)) + ) +} diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSource.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSource.scala index d8ebe5cbcd8b0..96514603efdcd 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSource.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSource.scala @@ -30,7 +30,7 @@ import org.apache.hudi.index.HoodieIndex.IndexType import org.apache.hudi.keygen.NonpartitionedKeyGenerator import org.apache.hudi.keygen.constant.KeyGeneratorOptions.Config import org.apache.hudi.testutils.{DataSourceTestUtils, HoodieClientTestBase} -import org.apache.hudi.{AvroConversionUtils, DataSourceReadOptions, DataSourceWriteOptions, HoodieDataSourceHelpers, HoodieSparkUtils} +import org.apache.hudi.{AvroConversionUtils, DataSourceReadOptions, DataSourceWriteOptions, HoodieDataSourceHelpers, HoodieSparkUtils, SparkDatasetMixin} import org.apache.log4j.LogManager import org.apache.spark.rdd.RDD import org.apache.spark.sql._ @@ -48,7 +48,7 @@ import scala.collection.JavaConverters._ /** * Tests on Spark DataSource for MOR table. 
*/ -class TestMORDataSource extends HoodieClientTestBase { +class TestMORDataSource extends HoodieClientTestBase with SparkDatasetMixin { var spark: SparkSession = null private val log = LogManager.getLogger(classOf[TestMORDataSource]) @@ -356,7 +356,7 @@ class TestMORDataSource extends HoodieClientTestBase { val hoodieRecords1 = dataGen.generateInserts("001", 100) - val inputDF1 = toDataset(hoodieRecords1) + val inputDF1 = toDataset(spark, hoodieRecords1) inputDF1.write.format("org.apache.hudi") .options(opts) .option("hoodie.compact.inline", "false") // else fails due to compaction & deltacommit instant times being same @@ -382,7 +382,7 @@ class TestMORDataSource extends HoodieClientTestBase { // Upsert 50 update records // Snopshot view should read 100 records val records2 = dataGen.generateUniqueUpdates("002", 50) - val inputDF2 = toDataset(records2) + val inputDF2 = toDataset(spark, records2) inputDF2.write.format("org.apache.hudi") .options(opts) .mode(SaveMode.Append) @@ -429,7 +429,7 @@ class TestMORDataSource extends HoodieClientTestBase { verifyShow(hudiIncDF1Skipmerge) val record3 = dataGen.generateUpdatesWithTS("003", hoodieRecords1, -1) - val inputDF3 = toDataset(record3) + val inputDF3 = toDataset(spark, record3) inputDF3.write.format("org.apache.hudi").options(opts) .mode(SaveMode.Append).save(basePath) @@ -443,16 +443,6 @@ class TestMORDataSource extends HoodieClientTestBase { assertEquals(0, hudiSnapshotDF3.filter("rider = 'rider-003'").count()) } - private def toDataset(records: util.List[HoodieRecord[_]]) = { - val avroRecords = records.map(_.getData - .asInstanceOf[HoodieRecordPayload[_]] - .getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA) - .get - .asInstanceOf[GenericRecord]) - val rdd: RDD[GenericRecord] = spark.sparkContext.parallelize(avroRecords, 2) - AvroConversionUtils.createDataFrame(rdd, HoodieTestDataGenerator.AVRO_SCHEMA.toString, spark) - } - @Test def testVectorizedReader() { spark.conf.set("spark.sql.parquet.enableVectorizedReader", true) @@ -499,6 +489,28 @@ class TestMORDataSource extends HoodieClientTestBase { hudiSnapshotDF2.show(1) } + @Test def testNoPrecombine() { + // Insert Operation + val records = recordsToStrings(dataGen.generateInserts("000", 100)).toList + val inputDF = spark.read.json(spark.sparkContext.parallelize(records, 2)) + + val commonOptsNoPreCombine = Map( + "hoodie.insert.shuffle.parallelism" -> "4", + "hoodie.upsert.shuffle.parallelism" -> "4", + DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key", + DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition", + HoodieWriteConfig.TBL_NAME.key -> "hoodie_test" + ) + inputDF.write.format("hudi") + .options(commonOptsNoPreCombine) + .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) + .option(DataSourceWriteOptions.TABLE_TYPE.key(), "MERGE_ON_READ") + .mode(SaveMode.Overwrite) + .save(basePath) + + spark.read.format("org.apache.hudi").load(basePath).count() + } + @Test def testPreCombineFiledForReadMOR(): Unit = { writeData((1, "a0", 10, 100, false)) diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSourceStorage.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSourceStorage.scala index 18b639f2f9bd2..8cf6b4174c9f2 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSourceStorage.scala +++ 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSourceStorage.scala @@ -23,6 +23,7 @@ import org.apache.hudi.common.config.HoodieMetadataConfig import org.apache.hudi.common.fs.FSUtils import org.apache.hudi.common.testutils.HoodieTestDataGenerator import org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings +import org.apache.hudi.common.util.StringUtils import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.testutils.SparkClientFunctionalTestHarness import org.apache.hudi.{DataSourceReadOptions, DataSourceWriteOptions, HoodieDataSourceHelpers} @@ -32,7 +33,7 @@ import org.apache.spark.sql.functions.{col, lit} import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue} import org.junit.jupiter.api.Tag import org.junit.jupiter.params.ParameterizedTest -import org.junit.jupiter.params.provider.ValueSource +import org.junit.jupiter.params.provider.CsvSource import scala.collection.JavaConversions._ @@ -57,19 +58,28 @@ class TestMORDataSourceStorage extends SparkClientFunctionalTestHarness { val updatedVerificationVal: String = "driver_update" @ParameterizedTest - @ValueSource(booleans = Array(true, false)) - def testMergeOnReadStorage(isMetadataEnabled: Boolean) { - val dataGen = new HoodieTestDataGenerator() + @CsvSource(Array( + "true,", + "true,fare.currency", + "false,", + "false,fare.currency" + )) + def testMergeOnReadStorage(isMetadataEnabled: Boolean, preComineField: String) { + var options: Map[String, String] = commonOpts + + (HoodieMetadataConfig.ENABLE.key -> String.valueOf(isMetadataEnabled)) + if (!StringUtils.isNullOrEmpty(preComineField)) { + options += (DataSourceWriteOptions.PRECOMBINE_FIELD.key() -> preComineField) + } + val dataGen = new HoodieTestDataGenerator(0xDEEF) val fs = FSUtils.getFs(basePath, spark.sparkContext.hadoopConfiguration) // Bulk Insert Operation val records1 = recordsToStrings(dataGen.generateInserts("001", 100)).toList val inputDF1: Dataset[Row] = spark.read.json(spark.sparkContext.parallelize(records1, 2)) inputDF1.write.format("org.apache.hudi") - .options(commonOpts) + .options(options) .option("hoodie.compact.inline", "false") // else fails due to compaction & deltacommit instant times being same .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) .option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL) - .option(HoodieMetadataConfig.ENABLE.key, isMetadataEnabled) .mode(SaveMode.Overwrite) .save(basePath) @@ -90,8 +100,7 @@ class TestMORDataSourceStorage extends SparkClientFunctionalTestHarness { val records2 = recordsToStrings(dataGen.generateUniqueUpdates("002", 100)).toList val inputDF2: Dataset[Row] = spark.read.json(spark.sparkContext.parallelize(records2, 2)) inputDF2.write.format("org.apache.hudi") - .options(commonOpts) - .option(HoodieMetadataConfig.ENABLE.key, isMetadataEnabled) + .options(options) .mode(SaveMode.Append) .save(basePath) @@ -110,8 +119,7 @@ class TestMORDataSourceStorage extends SparkClientFunctionalTestHarness { val inputDF3 = hudiSnapshotDF2.filter(col("_row_key") === verificationRowKey).withColumn(verificationCol, lit(updatedVerificationVal)) inputDF3.write.format("org.apache.hudi") - .options(commonOpts) - .option(HoodieMetadataConfig.ENABLE.key, isMetadataEnabled) + .options(options) .mode(SaveMode.Append) .save(basePath) diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestParquetColumnProjection.scala 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestParquetColumnProjection.scala index 2cdd7880bfec8..945d26be3f464 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestParquetColumnProjection.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestParquetColumnProjection.scala @@ -19,7 +19,7 @@ package org.apache.hudi.functional import org.apache.avro.Schema import org.apache.hudi.common.config.HoodieMetadataConfig -import org.apache.hudi.common.model.{HoodieRecord, OverwriteNonDefaultsWithLatestAvroPayload, OverwriteWithLatestAvroPayload} +import org.apache.hudi.common.model.{HoodieRecord, OverwriteNonDefaultsWithLatestAvroPayload} import org.apache.hudi.common.table.HoodieTableConfig import org.apache.hudi.common.testutils.{HadoopMapRedUtils, HoodieTestDataGenerator} import org.apache.hudi.config.{HoodieStorageConfig, HoodieWriteConfig} @@ -31,7 +31,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.{Dataset, HoodieUnsafeRDDUtils, Row, SaveMode} import org.junit.jupiter.api.Assertions.{assertEquals, fail} -import org.junit.jupiter.api.{Tag, Test} +import org.junit.jupiter.api.{Disabled, Tag, Test} import scala.collection.JavaConverters._ @@ -53,6 +53,7 @@ class TestParquetColumnProjection extends SparkClientFunctionalTestHarness with DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key -> classOf[NonpartitionedKeyGenerator].getName ) + @Disabled("HUDI-3896") @Test def testBaseFileOnlyViewRelation(): Unit = { val tablePath = s"$basePath/cow" @@ -129,7 +130,8 @@ class TestParquetColumnProjection extends SparkClientFunctionalTestHarness with fail("Only Spark 3 and Spark 2 are currently supported") // Test MOR / Read Optimized - runTest(tableState, DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, "null", projectedColumnsReadStatsReadOptimized) + // TODO(HUDI-3896) re-enable + //runTest(tableState, DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, "null", projectedColumnsReadStatsReadOptimized) } @Test @@ -184,7 +186,8 @@ class TestParquetColumnProjection extends SparkClientFunctionalTestHarness with fail("Only Spark 3 and Spark 2 are currently supported") // Test MOR / Read Optimized - runTest(tableState, DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, "null", projectedColumnsReadStatsReadOptimized) + // TODO(HUDI-3896) re-enable + //runTest(tableState, DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, "null", projectedColumnsReadStatsReadOptimized) } @Test @@ -329,7 +332,7 @@ class TestParquetColumnProjection extends SparkClientFunctionalTestHarness with logWarning(s"Not matching bytes read ($bytesRead)") } - val readColumns = targetColumns ++ relation.mandatoryColumns + val readColumns = targetColumns ++ relation.mandatoryFields val (_, projectedStructType, _) = HoodieSparkUtils.getRequiredSchema(tableState.schema, readColumns) val row: InternalRow = rows.take(1).head diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTableDropPartition.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTableDropPartition.scala index fdff6928a215f..ecbbadeeb9a28 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTableDropPartition.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTableDropPartition.scala @@ -52,7 +52,7 @@ class 
TestAlterTableDropPartition extends TestHoodieSqlBase { checkAnswer(s"show partitions $tableName")(Seq.empty: _*) } - test("Purge drop non-partitioned table") { + test("Lazy Clean drop non-partitioned table") { val tableName = generateTableName // create table spark.sql( @@ -66,13 +66,14 @@ class TestAlterTableDropPartition extends TestHoodieSqlBase { | using hudi | tblproperties ( | primaryKey = 'id', - | preCombineField = 'ts' + | preCombineField = 'ts', + | hoodie.cleaner.commits.retained= '1' | ) |""".stripMargin) // insert data spark.sql(s"""insert into $tableName values (1, "z3", "v1", "2021-10-01"), (2, "l4", "v1", "2021-10-02")""") - checkExceptionContain(s"alter table $tableName drop partition (dt='2021-10-01') purge")( + checkExceptionContain(s"alter table $tableName drop partition (dt='2021-10-01')")( s"$tableName is a non-partitioned table that is not allowed to drop partition") // show partitions @@ -131,14 +132,13 @@ class TestAlterTableDropPartition extends TestHoodieSqlBase { } Seq(false, true).foreach { urlencode => - test(s"Purge drop single-partition table' partitions, urlencode: $urlencode") { + test(s"Lazy Clean drop single-partition table' partitions, urlencode: $urlencode") { withTempDir { tmp => val tableName = generateTableName val tablePath = s"${tmp.getCanonicalPath}/$tableName" import spark.implicits._ - val df = Seq((1, "z3", "v1", "2021/10/01"), (2, "l4", "v1", "2021/10/02")) - .toDF("id", "name", "ts", "dt") + val df = Seq((1, "z3", "v1", "2021/10/01")).toDF("id", "name", "ts", "dt") df.write.format("hudi") .option(HoodieWriteConfig.TBL_NAME.key, tableName) @@ -158,17 +158,24 @@ class TestAlterTableDropPartition extends TestHoodieSqlBase { s""" |create table $tableName using hudi |location '$tablePath' + | tblproperties ( + | primaryKey = 'id', + | preCombineField = 'ts', + | hoodie.cleaner.commits.retained= '1' + | ) |""".stripMargin) // drop 2021-10-01 partition - spark.sql(s"alter table $tableName drop partition (dt='2021/10/01') purge") + spark.sql(s"alter table $tableName drop partition (dt='2021/10/01')") + + spark.sql(s"""insert into $tableName values (2, "l4", "v1", "2021/10/02")""") val partitionPath = if (urlencode) { PartitionPathEncodeUtils.escapePathName("2021/10/01") } else { "2021/10/01" } - checkAnswer(s"select dt from $tableName")(Seq(s"2021/10/02")) + checkAnswer(s"select dt from $tableName")(Seq("2021/10/02")) assertResult(false)(existsPath(s"${tmp.getCanonicalPath}/$tableName/$partitionPath")) // show partitions @@ -267,14 +274,13 @@ class TestAlterTableDropPartition extends TestHoodieSqlBase { } Seq(false, true).foreach { hiveStyle => - test(s"Purge drop multi-level partitioned table's partitions, isHiveStylePartitioning: $hiveStyle") { + test(s"Lazy Clean drop multi-level partitioned table's partitions, isHiveStylePartitioning: $hiveStyle") { withTempDir { tmp => val tableName = generateTableName val tablePath = s"${tmp.getCanonicalPath}/$tableName" import spark.implicits._ - val df = Seq((1, "z3", "v1", "2021", "10", "01"), (2, "l4", "v1", "2021", "10", "02")) - .toDF("id", "name", "ts", "year", "month", "day") + val df = Seq((1, "z3", "v1", "2021", "10", "01")).toDF("id", "name", "ts", "year", "month", "day") df.write.format("hudi") .option(HoodieWriteConfig.TBL_NAME.key, tableName) @@ -294,14 +300,23 @@ class TestAlterTableDropPartition extends TestHoodieSqlBase { s""" |create table $tableName using hudi |location '$tablePath' + | tblproperties ( + | primaryKey = 'id', + | preCombineField = 'ts', + | hoodie.cleaner.commits.retained= 
'1' + | ) |""".stripMargin) // drop 2021-10-01 partition - spark.sql(s"alter table $tableName drop partition (year='2021', month='10', day='01') purge") + spark.sql(s"alter table $tableName drop partition (year='2021', month='10', day='01')") + + // insert data + spark.sql(s"""insert into $tableName values (2, "l4", "v1", "2021", "10", "02")""") checkAnswer(s"select id, name, ts, year, month, day from $tableName")( Seq(2, "l4", "v1", "2021", "10", "02") ) + assertResult(false)(existsPath( s"${tmp.getCanonicalPath}/$tableName/year=2021/month=10/day=01")) diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCreateTable.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCreateTable.scala index 8ebef198af458..6b8efb84e32f1 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCreateTable.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCreateTable.scala @@ -263,83 +263,100 @@ class TestCreateTable extends TestHoodieSqlBase { test("Test Create Table As Select") { withTempDir { tmp => - // Create Non-Partitioned table - val tableName1 = generateTableName - spark.sql( - s""" - | create table $tableName1 using hudi - | tblproperties(primaryKey = 'id') - | location '${tmp.getCanonicalPath}/$tableName1' - | AS - | select 1 as id, 'a1' as name, 10 as price, 1000 as ts + Seq("cow", "mor").foreach { tableType => + // Create Non-Partitioned table + val tableName1 = generateTableName + spark.sql( + s""" + | create table $tableName1 using hudi + | tblproperties( + | primaryKey = 'id', + | type = '$tableType' + | ) + | location '${tmp.getCanonicalPath}/$tableName1' + | AS + | select 1 as id, 'a1' as name, 10 as price, 1000 as ts """.stripMargin) - checkAnswer(s"select id, name, price, ts from $tableName1")( - Seq(1, "a1", 10.0, 1000) - ) + checkAnswer(s"select id, name, price, ts from $tableName1")( + Seq(1, "a1", 10.0, 1000) + ) - // Create Partitioned table - val tableName2 = generateTableName - spark.sql( - s""" - | create table $tableName2 using hudi - | partitioned by (dt) - | tblproperties(primaryKey = 'id') - | location '${tmp.getCanonicalPath}/$tableName2' - | AS - | select 1 as id, 'a1' as name, 10 as price, '2021-04-01' as dt + // Create Partitioned table + val tableName2 = generateTableName + spark.sql( + s""" + | create table $tableName2 using hudi + | partitioned by (dt) + | tblproperties( + | primaryKey = 'id', + | type = '$tableType' + | ) + | location '${tmp.getCanonicalPath}/$tableName2' + | AS + | select 1 as id, 'a1' as name, 10 as price, '2021-04-01' as dt """.stripMargin - ) - checkAnswer(s"select id, name, price, dt from $tableName2")( - Seq(1, "a1", 10, "2021-04-01") - ) + ) + checkAnswer(s"select id, name, price, dt from $tableName2")( + Seq(1, "a1", 10, "2021-04-01") + ) - // Create Partitioned table with timestamp data type - val tableName3 = generateTableName - // CTAS failed with null primaryKey - assertThrows[Exception] { + // Create Partitioned table with timestamp data type + val tableName3 = generateTableName + // CTAS failed with null primaryKey + assertThrows[Exception] { + spark.sql( + s""" + | create table $tableName3 using hudi + | partitioned by (dt) + | tblproperties( + | primaryKey = 'id', + | type = '$tableType' + | ) + | location '${tmp.getCanonicalPath}/$tableName3' + | AS + | select null as id, 'a1' as name, 10 as price, '2021-05-07' as dt + | + """.stripMargin + ) + } + // Create table with timestamp type 
partition spark.sql( s""" | create table $tableName3 using hudi | partitioned by (dt) - | tblproperties(primaryKey = 'id') + | tblproperties( + | primaryKey = 'id', + | type = '$tableType' + | ) | location '${tmp.getCanonicalPath}/$tableName3' | AS - | select null as id, 'a1' as name, 10 as price, '2021-05-07' as dt - | - """.stripMargin - ) - } - // Create table with timestamp type partition - spark.sql( - s""" - | create table $tableName3 using hudi - | partitioned by (dt) - | tblproperties(primaryKey = 'id') - | location '${tmp.getCanonicalPath}/$tableName3' - | AS - | select cast('2021-05-06 00:00:00' as timestamp) as dt, 1 as id, 'a1' as name, 10 as - | price + | select cast('2021-05-06 00:00:00' as timestamp) as dt, 1 as id, 'a1' as name, 10 as + | price """.stripMargin - ) - checkAnswer(s"select id, name, price, cast(dt as string) from $tableName3")( - Seq(1, "a1", 10, "2021-05-06 00:00:00") - ) - // Create table with date type partition - val tableName4 = generateTableName - spark.sql( - s""" - | create table $tableName4 using hudi - | partitioned by (dt) - | tblproperties(primaryKey = 'id') - | location '${tmp.getCanonicalPath}/$tableName4' - | AS - | select cast('2021-05-06' as date) as dt, 1 as id, 'a1' as name, 10 as - | price + ) + checkAnswer(s"select id, name, price, cast(dt as string) from $tableName3")( + Seq(1, "a1", 10, "2021-05-06 00:00:00") + ) + // Create table with date type partition + val tableName4 = generateTableName + spark.sql( + s""" + | create table $tableName4 using hudi + | partitioned by (dt) + | tblproperties( + | primaryKey = 'id', + | type = '$tableType' + | ) + | location '${tmp.getCanonicalPath}/$tableName4' + | AS + | select cast('2021-05-06' as date) as dt, 1 as id, 'a1' as name, 10 as + | price """.stripMargin - ) - checkAnswer(s"select id, name, price, cast(dt as string) from $tableName4")( - Seq(1, "a1", 10, "2021-05-06") - ) + ) + checkAnswer(s"select id, name, price, cast(dt as string) from $tableName4")( + Seq(1, "a1", 10, "2021-05-06") + ) + } } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestDeleteTable.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestDeleteTable.scala index 9c693f9626090..b2e888a5f3140 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestDeleteTable.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestDeleteTable.scala @@ -17,6 +17,11 @@ package org.apache.spark.sql.hudi +import org.apache.hudi.DataSourceWriteOptions._ +import org.apache.hudi.config.HoodieWriteConfig +import org.apache.hudi.keygen.SimpleKeyGenerator +import org.apache.spark.sql.SaveMode + class TestDeleteTable extends TestHoodieSqlBase { test("Test Delete Table") { @@ -198,4 +203,46 @@ class TestDeleteTable extends TestHoodieSqlBase { } } } + + Seq(false, true).foreach { urlencode => + test(s"Test Delete single-partition table' partitions, urlencode: $urlencode") { + withTempDir { tmp => + val tableName = generateTableName + val tablePath = s"${tmp.getCanonicalPath}/$tableName" + + import spark.implicits._ + val df = Seq((1, "z3", "v1", "2021/10/01"), (2, "l4", "v1", "2021/10/02")) + .toDF("id", "name", "ts", "dt") + + df.write.format("hudi") + .option(HoodieWriteConfig.TBL_NAME.key, tableName) + .option(TABLE_TYPE.key, MOR_TABLE_TYPE_OPT_VAL) + .option(RECORDKEY_FIELD.key, "id") + .option(PRECOMBINE_FIELD.key, "ts") + .option(PARTITIONPATH_FIELD.key, "dt") + .option(URL_ENCODE_PARTITIONING.key(), 
urlencode) + .option(KEYGENERATOR_CLASS_NAME.key, classOf[SimpleKeyGenerator].getName) + .option(HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key, "1") + .option(HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key, "1") + .mode(SaveMode.Overwrite) + .save(tablePath) + + // register meta to spark catalog by creating table + spark.sql( + s""" + |create table $tableName using hudi + |location '$tablePath' + |""".stripMargin) + + // delete 2021-10-01 partition + if (urlencode) { + spark.sql(s"""delete from $tableName where dt="2021/10/01"""") + } else { + spark.sql(s"delete from $tableName where dt='2021/10/01'") + } + + checkAnswer(s"select dt from $tableName")(Seq(s"2021/10/02")) + } + } + } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSpark3DDL.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSpark3DDL.scala index ae828ed9f7305..54163635984bf 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSpark3DDL.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestSpark3DDL.scala @@ -445,28 +445,19 @@ class TestSpark3DDL extends TestHoodieSqlBase { Seq(null), Seq(Map("t1" -> 10.0d)) ) + spark.sql(s"alter table ${tableName} rename column members to mem") + spark.sql(s"alter table ${tableName} rename column mem.value.n to nn") + spark.sql(s"alter table ${tableName} rename column userx to us") + spark.sql(s"alter table ${tableName} rename column us.age to age1") + + spark.sql(s"insert into ${tableName} values(2 , map('k1', struct('v1', 100), 'k2', struct('v2', 200)), struct('jackStructNew', 291 , 101), 'jacknew', 1000, map('t1', 10))") + spark.sql(s"select mem.value.nn, us.age1 from $tableName order by id").show() + checkAnswer(spark.sql(s"select mem.value.nn, us.age1 from $tableName order by id").collect())( + Seq(null, 29), + Seq(null, 291) + ) } } } } - - private def performClustering(writeDf: DataFrame, basePath: String, tableName: String, tableType: String): Unit = { - writeDf.write.format("org.apache.hudi") - .option(DataSourceWriteOptions.TABLE_TYPE.key(), tableType) - .option("hoodie.upsert.shuffle.parallelism", "1") - .option(DataSourceWriteOptions.RECORDKEY_FIELD.key(), "id") - .option(DataSourceWriteOptions.PRECOMBINE_FIELD.key(), "comb") - .option(DataSourceWriteOptions.PARTITIONPATH_FIELD.key(), "par") - .option(HoodieWriteConfig.TBL_NAME.key, tableName) - .option("hoodie.schema.on.read.enable", "true") - // option for clustering - .option("hoodie.clustering.inline", "true") - .option("hoodie.clustering.inline.max.commits", "1") - .option("hoodie.clustering.plan.strategy.small.file.limit", String.valueOf(2*1024*1024L)) - .option("hoodie.clustering.plan.strategy.max.bytes.per.group", String.valueOf(10*1024*1024L)) - .option("hoodie.clustering.plan.strategy.target.file.max.bytes", String.valueOf(4 * 1024* 1024L)) - .option(HoodieClusteringConfig.PLAN_STRATEGY_SORT_COLUMNS.key, "col1, col2") - .mode(SaveMode.Append) - .save(basePath) - } } diff --git a/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/adapter/Spark2Adapter.scala b/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/adapter/Spark2Adapter.scala index e4b3c4010a5e1..0e74c997d7ee4 100644 --- a/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/adapter/Spark2Adapter.scala +++ b/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/adapter/Spark2Adapter.scala @@ -30,7 +30,7 @@ import 
org.apache.spark.sql.catalyst.plans.JoinType import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, Join, LogicalPlan} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.{AliasIdentifier, TableIdentifier} -import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat +import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat, Spark24HoodieParquetFileFormat} import org.apache.spark.sql.execution.datasources.{FilePartition, PartitionedFile, Spark2ParsePartitionUtil, SparkParsePartitionUtil} import org.apache.spark.sql.hudi.SparkAdapter import org.apache.spark.sql.hudi.parser.HoodieSpark2ExtendedSqlParser @@ -165,7 +165,7 @@ class Spark2Adapter extends SparkAdapter { } } - override def createHoodieParquetFileFormat(): Option[ParquetFileFormat] = { - Some(new ParquetFileFormat) + override def createHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat] = { + Some(new Spark24HoodieParquetFileFormat(appendPartitionValues)) } } diff --git a/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark24HoodieParquetFileFormat.scala b/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark24HoodieParquetFileFormat.scala new file mode 100644 index 0000000000000..6fb5c50c03a2b --- /dev/null +++ b/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark24HoodieParquetFileFormat.scala @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources.parquet + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hadoop.mapreduce.lib.input.FileSplit +import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl +import org.apache.hadoop.mapreduce.{JobID, TaskAttemptID, TaskID, TaskType} +import org.apache.parquet.filter2.compat.FilterCompat +import org.apache.parquet.filter2.predicate.FilterApi +import org.apache.parquet.format.converter.ParquetMetadataConverter.SKIP_ROW_GROUPS +import org.apache.parquet.hadoop.{ParquetFileReader, ParquetInputFormat, ParquetRecordReader} +import org.apache.spark.TaskContext +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.avro.AvroDeserializer +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection +import org.apache.spark.sql.catalyst.expressions.{JoinedRow, UnsafeRow} +import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.execution.datasources.{PartitionedFile, RecordReaderIterator} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources.Filter +import org.apache.spark.sql.types.{AtomicType, StructType} +import org.apache.spark.util.SerializableConfiguration + +import java.net.URI + +/** + * This class is an extension of [[ParquetFileFormat]] overriding Spark-specific behavior + * that's not possible to customize in any other way + * + * NOTE: This is a version of [[AvroDeserializer]] impl from Spark 2.4.4 w/ w/ the following changes applied to it: + *
*
+ *   1. Avoiding appending partition values to the rows read from the data file
+ */ +class Spark24HoodieParquetFileFormat(private val shouldAppendPartitionValues: Boolean) extends ParquetFileFormat { + + override def buildReaderWithPartitionValues(sparkSession: SparkSession, + dataSchema: StructType, + partitionSchema: StructType, + requiredSchema: StructType, + filters: Seq[Filter], + options: Map[String, String], + hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { + hadoopConf.set(ParquetInputFormat.READ_SUPPORT_CLASS, classOf[ParquetReadSupport].getName) + hadoopConf.set( + ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA, + requiredSchema.json) + hadoopConf.set( + ParquetWriteSupport.SPARK_ROW_SCHEMA, + requiredSchema.json) + hadoopConf.set( + SQLConf.SESSION_LOCAL_TIMEZONE.key, + sparkSession.sessionState.conf.sessionLocalTimeZone) + hadoopConf.setBoolean( + SQLConf.CASE_SENSITIVE.key, + sparkSession.sessionState.conf.caseSensitiveAnalysis) + + ParquetWriteSupport.setSchema(requiredSchema, hadoopConf) + + // Sets flags for `ParquetToSparkSchemaConverter` + hadoopConf.setBoolean( + SQLConf.PARQUET_BINARY_AS_STRING.key, + sparkSession.sessionState.conf.isParquetBinaryAsString) + hadoopConf.setBoolean( + SQLConf.PARQUET_INT96_AS_TIMESTAMP.key, + sparkSession.sessionState.conf.isParquetINT96AsTimestamp) + + val broadcastedHadoopConf = + sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) + + // TODO: if you move this into the closure it reverts to the default values. + // If true, enable using the custom RecordReader for parquet. This only works for + // a subset of the types (no complex types). + val resultSchema = StructType(partitionSchema.fields ++ requiredSchema.fields) + val sqlConf = sparkSession.sessionState.conf + val enableOffHeapColumnVector = sqlConf.offHeapColumnVectorEnabled + val enableVectorizedReader: Boolean = + sqlConf.parquetVectorizedReaderEnabled && + resultSchema.forall(_.dataType.isInstanceOf[AtomicType]) + val enableRecordFilter: Boolean = sqlConf.parquetRecordFilterEnabled + val timestampConversion: Boolean = sqlConf.isParquetINT96TimestampConversion + val capacity = sqlConf.parquetVectorizedReaderBatchSize + val enableParquetFilterPushDown: Boolean = sqlConf.parquetFilterPushDown + // Whole stage codegen (PhysicalRDD) is able to deal with batches directly + val returningBatch = supportBatch(sparkSession, resultSchema) + val pushDownDate = sqlConf.parquetFilterPushDownDate + val pushDownTimestamp = sqlConf.parquetFilterPushDownTimestamp + val pushDownDecimal = sqlConf.parquetFilterPushDownDecimal + val pushDownStringStartWith = sqlConf.parquetFilterPushDownStringStartWith + val pushDownInFilterThreshold = sqlConf.parquetFilterPushDownInFilterThreshold + val isCaseSensitive = sqlConf.caseSensitiveAnalysis + + (file: PartitionedFile) => { + assert(!shouldAppendPartitionValues || file.partitionValues.numFields == partitionSchema.size) + + val fileSplit = + new FileSplit(new Path(new URI(file.filePath)), file.start, file.length, Array.empty) + val filePath = fileSplit.getPath + + val split = + new org.apache.parquet.hadoop.ParquetInputSplit( + filePath, + fileSplit.getStart, + fileSplit.getStart + fileSplit.getLength, + fileSplit.getLength, + fileSplit.getLocations, + null) + + val sharedConf = broadcastedHadoopConf.value.value + + lazy val footerFileMetaData = + ParquetFileReader.readFooter(sharedConf, filePath, SKIP_ROW_GROUPS).getFileMetaData + // Try to push down filters when filter push-down is enabled. 
+ val pushed = if (enableParquetFilterPushDown) { + val parquetSchema = footerFileMetaData.getSchema + val parquetFilters = new ParquetFilters(pushDownDate, pushDownTimestamp, pushDownDecimal, + pushDownStringStartWith, pushDownInFilterThreshold, isCaseSensitive) + filters + // Collects all converted Parquet filter predicates. Notice that not all predicates can be + // converted (`ParquetFilters.createFilter` returns an `Option`). That's why a `flatMap` + // is used here. + .flatMap(parquetFilters.createFilter(parquetSchema, _)) + .reduceOption(FilterApi.and) + } else { + None + } + + // PARQUET_INT96_TIMESTAMP_CONVERSION says to apply timezone conversions to int96 timestamps' + // *only* if the file was created by something other than "parquet-mr", so check the actual + // writer here for this file. We have to do this per-file, as each file in the table may + // have different writers. + // Define isCreatedByParquetMr as function to avoid unnecessary parquet footer reads. + def isCreatedByParquetMr: Boolean = + footerFileMetaData.getCreatedBy().startsWith("parquet-mr") + + val convertTz = + if (timestampConversion && !isCreatedByParquetMr) { + Some(DateTimeUtils.getTimeZone(sharedConf.get(SQLConf.SESSION_LOCAL_TIMEZONE.key))) + } else { + None + } + + val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0) + val hadoopAttemptContext = + new TaskAttemptContextImpl(broadcastedHadoopConf.value.value, attemptId) + + // Try to push down filters when filter push-down is enabled. + // Notice: This push-down is RowGroups level, not individual records. + if (pushed.isDefined) { + ParquetInputFormat.setFilterPredicate(hadoopAttemptContext.getConfiguration, pushed.get) + } + val taskContext = Option(TaskContext.get()) + if (enableVectorizedReader) { + val vectorizedReader = new VectorizedParquetRecordReader( + convertTz.orNull, enableOffHeapColumnVector && taskContext.isDefined, capacity) + val iter = new RecordReaderIterator(vectorizedReader) + // SPARK-23457 Register a task completion lister before `initialization`. + taskContext.foreach(_.addTaskCompletionListener[Unit](_ => iter.close())) + vectorizedReader.initialize(split, hadoopAttemptContext) + logDebug(s"Appending $partitionSchema ${file.partitionValues}") + + // NOTE: We're making appending of the partitioned values to the rows read from the + // data file configurable + if (shouldAppendPartitionValues) { + vectorizedReader.initBatch(partitionSchema, file.partitionValues) + } else { + vectorizedReader.initBatch(StructType(Nil), InternalRow.empty) + } + + if (returningBatch) { + vectorizedReader.enableReturningBatches() + } + + // UnsafeRowParquetRecordReader appends the columns internally to avoid another copy. + iter.asInstanceOf[Iterator[InternalRow]] + } else { + logDebug(s"Falling back to parquet-mr") + // ParquetRecordReader returns UnsafeRow + val reader = if (pushed.isDefined && enableRecordFilter) { + val parquetFilter = FilterCompat.get(pushed.get, null) + new ParquetRecordReader[UnsafeRow](new ParquetReadSupport(convertTz), parquetFilter) + } else { + new ParquetRecordReader[UnsafeRow](new ParquetReadSupport(convertTz)) + } + val iter = new RecordReaderIterator(reader) + // SPARK-23457 Register a task completion lister before `initialization`. 
+ taskContext.foreach(_.addTaskCompletionListener[Unit](_ => iter.close())) + reader.initialize(split, hadoopAttemptContext) + + val fullSchema = requiredSchema.toAttributes ++ partitionSchema.toAttributes + val joinedRow = new JoinedRow() + val appendPartitionColumns = GenerateUnsafeProjection.generate(fullSchema, fullSchema) + + // This is a horrible erasure hack... if we type the iterator above, then it actually check + // the type in next() and we get a class cast exception. If we make that function return + // Object, then we can defer the cast until later! + // + // NOTE: We're making appending of the partitioned values to the rows read from the + // data file configurable + if (!shouldAppendPartitionValues || partitionSchema.length == 0) { + // There is no partition columns + iter.asInstanceOf[Iterator[InternalRow]] + } else { + iter.asInstanceOf[Iterator[InternalRow]] + .map(d => appendPartitionColumns(joinedRow(d, file.partitionValues))) + } + + } + } + } +} diff --git a/hudi-spark-datasource/hudi-spark2/src/test/resources/log4j-surefire.properties b/hudi-spark-datasource/hudi-spark2/src/test/resources/log4j-surefire.properties index 32af462093ae5..14bbb089724c8 100644 --- a/hudi-spark-datasource/hudi-spark2/src/test/resources/log4j-surefire.properties +++ b/hudi-spark-datasource/hudi-spark2/src/test/resources/log4j-surefire.properties @@ -20,9 +20,9 @@ log4j.logger.org.apache=INFO log4j.logger.org.apache.hudi=DEBUG log4j.logger.org.apache.hadoop.hbase=ERROR -# A1 is set to be a ConsoleAppender. +# CONSOLE is set to be a ConsoleAppender. log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. +# CONSOLE uses PatternLayout. log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter diff --git a/hudi-spark-datasource/hudi-spark3-common/src/main/scala/org/apache/spark/sql/execution/datasources/Spark3ParsePartitionUtil.scala b/hudi-spark-datasource/hudi-spark3-common/src/main/scala/org/apache/spark/sql/execution/datasources/Spark3ParsePartitionUtil.scala index 6f29053aef212..f0cbe0530f3e2 100644 --- a/hudi-spark-datasource/hudi-spark3-common/src/main/scala/org/apache/spark/sql/execution/datasources/Spark3ParsePartitionUtil.scala +++ b/hudi-spark-datasource/hudi-spark3-common/src/main/scala/org/apache/spark/sql/execution/datasources/Spark3ParsePartitionUtil.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution.datasources -import java.lang.{Double => JDouble, Long => JLong} +import java.lang.{Boolean => JBoolean, Double => JDouble, Long => JLong} import java.math.{BigDecimal => JBigDecimal} import java.time.ZoneId import java.util.{Locale, TimeZone} @@ -253,6 +253,7 @@ class Spark3ParsePartitionUtil(conf: SQLConf) extends SparkParsePartitionUtil { zoneId: ZoneId): Any = desiredType match { case _ if value == DEFAULT_PARTITION_PATH => null case NullType => null + case BooleanType => JBoolean.parseBoolean(value) case StringType => UTF8String.fromString(unescapePathName(value)) case IntegerType => Integer.parseInt(value) case LongType => JLong.parseLong(value) diff --git a/hudi-spark-datasource/hudi-spark3.1.x/src/main/scala/org/apache/spark/sql/adapter/Spark3_1Adapter.scala b/hudi-spark-datasource/hudi-spark3.1.x/src/main/scala/org/apache/spark/sql/adapter/Spark3_1Adapter.scala index 13dba82488271..22431cb2574a3 100644 --- 
a/hudi-spark-datasource/hudi-spark3.1.x/src/main/scala/org/apache/spark/sql/adapter/Spark3_1Adapter.scala +++ b/hudi-spark-datasource/hudi-spark3.1.x/src/main/scala/org/apache/spark/sql/adapter/Spark3_1Adapter.scala @@ -19,14 +19,13 @@ package org.apache.spark.sql.adapter import org.apache.avro.Schema -import org.apache.spark.sql.avro.{HoodieAvroDeserializer, HoodieAvroSchemaConverters, HoodieAvroSerializer, HoodieSpark3_1AvroDeserializer, HoodieSpark3_1AvroSerializer, HoodieSparkAvroSchemaConverters} -import org.apache.spark.sql.hudi.SparkAdapter -import org.apache.spark.sql.types.DataType -import org.apache.spark.sql.{HoodieCatalystExpressionUtils, HoodieSpark3_1CatalystExpressionUtils} import org.apache.spark.SPARK_VERSION -import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat +import org.apache.spark.sql.avro.{HoodieAvroDeserializer, HoodieAvroSerializer, HoodieSpark3_1AvroDeserializer, HoodieSpark3_1AvroSerializer} import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat, Spark31HoodieParquetFileFormat} +import org.apache.spark.sql.hudi.SparkAdapter +import org.apache.spark.sql.types.DataType import org.apache.spark.sql.{HoodieCatalystExpressionUtils, HoodieSpark3_1CatalystExpressionUtils, SparkSession} /** @@ -55,14 +54,7 @@ class Spark3_1Adapter extends BaseSpark3Adapter { } } - override def createHoodieParquetFileFormat(): Option[ParquetFileFormat] = { - if (SPARK_VERSION.startsWith("3.1")) { - val loadClassName = "org.apache.spark.sql.execution.datasources.parquet.Spark312HoodieParquetFileFormat" - val clazz = Class.forName(loadClassName, true, Thread.currentThread().getContextClassLoader) - val ctor = clazz.getConstructors.head - Some(ctor.newInstance().asInstanceOf[ParquetFileFormat]) - } else { - None - } + override def createHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat] = { + Some(new Spark31HoodieParquetFileFormat(appendPartitionValues)) } } diff --git a/hudi-spark-datasource/hudi-spark3.1.x/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark312HoodieParquetFileFormat.scala b/hudi-spark-datasource/hudi-spark3.1.x/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark312HoodieParquetFileFormat.scala deleted file mode 100644 index 83b3162bbc328..0000000000000 --- a/hudi-spark-datasource/hudi-spark3.1.x/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark312HoodieParquetFileFormat.scala +++ /dev/null @@ -1,365 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.execution.datasources.parquet - -import java.net.URI -import java.util -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.Path -import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl -import org.apache.hadoop.mapreduce.{JobID, TaskAttemptID, TaskID, TaskType} -import org.apache.hudi.client.utils.SparkInternalSchemaConverter -import org.apache.hudi.common.fs.FSUtils -import org.apache.hudi.HoodieSparkUtils -import org.apache.hudi.common.util.InternalSchemaCache -import org.apache.hudi.common.util.collection.Pair -import org.apache.hudi.internal.schema.InternalSchema -import org.apache.hudi.internal.schema.utils.{InternalSchemaUtils, SerDeHelper} -import org.apache.hudi.internal.schema.action.InternalSchemaMerger -import org.apache.parquet.filter2.compat.FilterCompat -import org.apache.parquet.filter2.predicate.FilterApi -import org.apache.parquet.format.converter.ParquetMetadataConverter.SKIP_ROW_GROUPS -import org.apache.parquet.hadoop.{ParquetFileReader, ParquetInputFormat, ParquetRecordReader} - -import org.apache.spark.TaskContext -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Cast, JoinedRow} -import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection -import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.execution.datasources.{DataSourceUtils, PartitionedFile, RecordReaderIterator} -import org.apache.spark.sql.execution.datasources.parquet._ -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources._ -import org.apache.spark.sql.types.{AtomicType, DataType, StructField, StructType} -import org.apache.spark.util.SerializableConfiguration - -class Spark312HoodieParquetFileFormat extends ParquetFileFormat { - - // reference ParquetFileFormat from spark project - override def buildReaderWithPartitionValues( - sparkSession: SparkSession, - dataSchema: StructType, - partitionSchema: StructType, - requiredSchema: StructType, - filters: Seq[Filter], - options: Map[String, String], - hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { - if (hadoopConf.get(SparkInternalSchemaConverter.HOODIE_QUERY_SCHEMA, "").isEmpty) { - // fallback to origin parquet File read - super.buildReaderWithPartitionValues(sparkSession, dataSchema, partitionSchema, requiredSchema, filters, options, hadoopConf) - } else { - hadoopConf.set(ParquetInputFormat.READ_SUPPORT_CLASS, classOf[ParquetReadSupport].getName) - hadoopConf.set( - ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA, - requiredSchema.json) - hadoopConf.set( - ParquetWriteSupport.SPARK_ROW_SCHEMA, - requiredSchema.json) - hadoopConf.set( - SQLConf.SESSION_LOCAL_TIMEZONE.key, - sparkSession.sessionState.conf.sessionLocalTimeZone) - hadoopConf.setBoolean( - SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key, - sparkSession.sessionState.conf.nestedSchemaPruningEnabled) - hadoopConf.setBoolean( - SQLConf.CASE_SENSITIVE.key, - sparkSession.sessionState.conf.caseSensitiveAnalysis) - - ParquetWriteSupport.setSchema(requiredSchema, hadoopConf) - - // Sets flags for `ParquetToSparkSchemaConverter` - hadoopConf.setBoolean( - SQLConf.PARQUET_BINARY_AS_STRING.key, - sparkSession.sessionState.conf.isParquetBinaryAsString) - hadoopConf.setBoolean( - SQLConf.PARQUET_INT96_AS_TIMESTAMP.key, - sparkSession.sessionState.conf.isParquetINT96AsTimestamp) - // for dataSource v1, we have no method to do project for spark physical 
plan. - // it's safe to do cols project here. - val internalSchemaString = hadoopConf.get(SparkInternalSchemaConverter.HOODIE_QUERY_SCHEMA) - val querySchemaOption = SerDeHelper.fromJson(internalSchemaString) - if (querySchemaOption.isPresent && !requiredSchema.isEmpty) { - val prunedSchema = SparkInternalSchemaConverter.convertAndPruneStructTypeToInternalSchema(requiredSchema, querySchemaOption.get()) - hadoopConf.set(SparkInternalSchemaConverter.HOODIE_QUERY_SCHEMA, SerDeHelper.toJson(prunedSchema)) - } - val broadcastedHadoopConf = - sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) - - // TODO: if you move this into the closure it reverts to the default values. - // If true, enable using the custom RecordReader for parquet. This only works for - // a subset of the types (no complex types). - val resultSchema = StructType(partitionSchema.fields ++ requiredSchema.fields) - val sqlConf = sparkSession.sessionState.conf - val enableOffHeapColumnVector = sqlConf.offHeapColumnVectorEnabled - val enableVectorizedReader: Boolean = - sqlConf.parquetVectorizedReaderEnabled && - resultSchema.forall(_.dataType.isInstanceOf[AtomicType]) - val enableRecordFilter: Boolean = sqlConf.parquetRecordFilterEnabled - val timestampConversion: Boolean = sqlConf.isParquetINT96TimestampConversion - val capacity = sqlConf.parquetVectorizedReaderBatchSize - val enableParquetFilterPushDown: Boolean = sqlConf.parquetFilterPushDown - // Whole stage codegen (PhysicalRDD) is able to deal with batches directly - val returningBatch = supportBatch(sparkSession, resultSchema) - val pushDownDate = sqlConf.parquetFilterPushDownDate - val pushDownTimestamp = sqlConf.parquetFilterPushDownTimestamp - val pushDownDecimal = sqlConf.parquetFilterPushDownDecimal - val pushDownStringStartWith = sqlConf.parquetFilterPushDownStringStartWith - val pushDownInFilterThreshold = sqlConf.parquetFilterPushDownInFilterThreshold - val isCaseSensitive = sqlConf.caseSensitiveAnalysis - - (file: PartitionedFile) => { - assert(file.partitionValues.numFields == partitionSchema.size) - val filePath = new Path(new URI(file.filePath)) - val split = - new org.apache.parquet.hadoop.ParquetInputSplit( - filePath, - file.start, - file.start + file.length, - file.length, - Array.empty, - null) - val sharedConf = broadcastedHadoopConf.value.value - // do deal with internalSchema - val internalSchemaString = sharedConf.get(SparkInternalSchemaConverter.HOODIE_QUERY_SCHEMA) - // querySchema must be a pruned schema. - val querySchemaOption = SerDeHelper.fromJson(internalSchemaString) - val internalSchemaChangeEnabled = if (internalSchemaString.isEmpty || !querySchemaOption.isPresent) false else true - val tablePath = sharedConf.get(SparkInternalSchemaConverter.HOODIE_TABLE_PATH) - val commitInstantTime = FSUtils.getCommitTime(filePath.getName).toLong; - val fileSchema = if (internalSchemaChangeEnabled) { - val validCommits = sharedConf.get(SparkInternalSchemaConverter.HOODIE_VALID_COMMITS_LIST) - InternalSchemaCache.getInternalSchemaByVersionId(commitInstantTime, tablePath, sharedConf, if (validCommits == null) "" else validCommits) - } else { - // this should not happened, searchSchemaAndCache will deal with correctly. 
- null - } - - lazy val footerFileMetaData = - ParquetFileReader.readFooter(sharedConf, filePath, SKIP_ROW_GROUPS).getFileMetaData - val datetimeRebaseMode = DataSourceUtils.datetimeRebaseMode( - footerFileMetaData.getKeyValueMetaData.get, - SQLConf.get.getConf(SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_READ)) - // Try to push down filters when filter push-down is enabled. - val pushed = if (enableParquetFilterPushDown) { - val parquetSchema = footerFileMetaData.getSchema - val parquetFilters = if (HoodieSparkUtils.gteqSpark3_1_3) { - Spark312HoodieParquetFileFormat.createParquetFilters( - parquetSchema, - pushDownDate, - pushDownTimestamp, - pushDownDecimal, - pushDownStringStartWith, - pushDownInFilterThreshold, - isCaseSensitive, - datetimeRebaseMode) - } else { - Spark312HoodieParquetFileFormat.createParquetFilters( - parquetSchema, - pushDownDate, - pushDownTimestamp, - pushDownDecimal, - pushDownStringStartWith, - pushDownInFilterThreshold, - isCaseSensitive) - } - filters.map(Spark312HoodieParquetFileFormat.rebuildFilterFromParquet(_, fileSchema, querySchemaOption.get())) - // Collects all converted Parquet filter predicates. Notice that not all predicates can be - // converted (`ParquetFilters.createFilter` returns an `Option`). That's why a `flatMap` - // is used here. - .flatMap(parquetFilters.createFilter(_)) - .reduceOption(FilterApi.and) - } else { - None - } - - // PARQUET_INT96_TIMESTAMP_CONVERSION says to apply timezone conversions to int96 timestamps' - // *only* if the file was created by something other than "parquet-mr", so check the actual - // writer here for this file. We have to do this per-file, as each file in the table may - // have different writers. - // Define isCreatedByParquetMr as function to avoid unnecessary parquet footer reads. - def isCreatedByParquetMr: Boolean = - footerFileMetaData.getCreatedBy().startsWith("parquet-mr") - - val convertTz = - if (timestampConversion && !isCreatedByParquetMr) { - Some(DateTimeUtils.getZoneId(sharedConf.get(SQLConf.SESSION_LOCAL_TIMEZONE.key))) - } else { - None - } - val int96RebaseMode = DataSourceUtils.int96RebaseMode( - footerFileMetaData.getKeyValueMetaData.get, - SQLConf.get.getConf(SQLConf.LEGACY_PARQUET_INT96_REBASE_MODE_IN_READ)) - - val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0) - // use new conf - val hadoopAttempConf = new Configuration(broadcastedHadoopConf.value.value) - // - // reset request schema - var typeChangeInfos: java.util.Map[Integer, Pair[DataType, DataType]] = new java.util.HashMap() - if (internalSchemaChangeEnabled) { - val mergedInternalSchema = new InternalSchemaMerger(fileSchema, querySchemaOption.get(), true, true).mergeSchema() - val mergedSchema = SparkInternalSchemaConverter.constructSparkSchemaFromInternalSchema(mergedInternalSchema) - typeChangeInfos = SparkInternalSchemaConverter.collectTypeChangedCols(querySchemaOption.get(), mergedInternalSchema) - hadoopAttempConf.set(ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA, mergedSchema.json) - } - val hadoopAttemptContext = - new TaskAttemptContextImpl(hadoopAttempConf, attemptId) - - // Try to push down filters when filter push-down is enabled. - // Notice: This push-down is RowGroups level, not individual records. 
- if (pushed.isDefined) { - ParquetInputFormat.setFilterPredicate(hadoopAttemptContext.getConfiguration, pushed.get) - } - val taskContext = Option(TaskContext.get()) - if (enableVectorizedReader) { - val vectorizedReader = new Spark312HoodieVectorizedParquetRecordReader( - convertTz.orNull, - datetimeRebaseMode.toString, - int96RebaseMode.toString, - enableOffHeapColumnVector && taskContext.isDefined, - capacity, typeChangeInfos) - val iter = new RecordReaderIterator(vectorizedReader) - // SPARK-23457 Register a task completion listener before `initialization`. - taskContext.foreach(_.addTaskCompletionListener[Unit](_ => iter.close())) - vectorizedReader.initialize(split, hadoopAttemptContext) - logDebug(s"Appending $partitionSchema ${file.partitionValues}") - vectorizedReader.initBatch(partitionSchema, file.partitionValues) - if (returningBatch) { - vectorizedReader.enableReturningBatches() - } - - // UnsafeRowParquetRecordReader appends the columns internally to avoid another copy. - iter.asInstanceOf[Iterator[InternalRow]] - } else { - logDebug(s"Falling back to parquet-mr") - // ParquetRecordReader returns InternalRow - val readSupport = new ParquetReadSupport( - convertTz, - enableVectorizedReader = false, - datetimeRebaseMode, - int96RebaseMode) - val reader = if (pushed.isDefined && enableRecordFilter) { - val parquetFilter = FilterCompat.get(pushed.get, null) - new ParquetRecordReader[InternalRow](readSupport, parquetFilter) - } else { - new ParquetRecordReader[InternalRow](readSupport) - } - val iter = new RecordReaderIterator[InternalRow](reader) - // SPARK-23457 Register a task completion listener before `initialization`. - taskContext.foreach(_.addTaskCompletionListener[Unit](_ => iter.close())) - reader.initialize(split, hadoopAttemptContext) - - val fullSchema = requiredSchema.toAttributes ++ partitionSchema.toAttributes - val unsafeProjection = if (typeChangeInfos.isEmpty) { - GenerateUnsafeProjection.generate(fullSchema, fullSchema) - } else { - // find type changed. 
- val newFullSchema = new StructType(requiredSchema.fields.zipWithIndex.map { case (f, i) => - if (typeChangeInfos.containsKey(i)) { - StructField(f.name, typeChangeInfos.get(i).getRight, f.nullable, f.metadata) - } else f - }).toAttributes ++ partitionSchema.toAttributes - val castSchema = newFullSchema.zipWithIndex.map { case (attr, i) => - if (typeChangeInfos.containsKey(i)) { - Cast(attr, typeChangeInfos.get(i).getLeft) - } else attr - } - GenerateUnsafeProjection.generate(castSchema, newFullSchema) - } - - if (partitionSchema.length == 0) { - // There is no partition columns - iter.map(unsafeProjection) - } else { - val joinedRow = new JoinedRow() - iter.map(d => unsafeProjection(joinedRow(d, file.partitionValues))) - } - } - } - } - } -} - -object Spark312HoodieParquetFileFormat { - - val PARQUET_FILTERS_CLASS_NAME = "org.apache.spark.sql.execution.datasources.parquet.ParquetFilters" - - private def createParquetFilters(arg: Any*): ParquetFilters = { - val clazz = Class.forName(PARQUET_FILTERS_CLASS_NAME, true, Thread.currentThread().getContextClassLoader) - val ctor = clazz.getConstructors.head - ctor.newInstance(arg.map(_.asInstanceOf[AnyRef]): _*).asInstanceOf[ParquetFilters] - } - - private def rebuildFilterFromParquet(oldFilter: Filter, fileSchema: InternalSchema, querySchema: InternalSchema): Filter = { - if (fileSchema == null || querySchema == null) { - oldFilter - } else { - oldFilter match { - case eq: EqualTo => - val newAttribute = InternalSchemaUtils.reBuildFilterName(eq.attribute, fileSchema, querySchema) - if (newAttribute.isEmpty) AlwaysTrue else eq.copy(attribute = newAttribute) - case eqs: EqualNullSafe => - val newAttribute = InternalSchemaUtils.reBuildFilterName(eqs.attribute, fileSchema, querySchema) - if (newAttribute.isEmpty) AlwaysTrue else eqs.copy(attribute = newAttribute) - case gt: GreaterThan => - val newAttribute = InternalSchemaUtils.reBuildFilterName(gt.attribute, fileSchema, querySchema) - if (newAttribute.isEmpty) AlwaysTrue else gt.copy(attribute = newAttribute) - case gtr: GreaterThanOrEqual => - val newAttribute = InternalSchemaUtils.reBuildFilterName(gtr.attribute, fileSchema, querySchema) - if (newAttribute.isEmpty) AlwaysTrue else gtr.copy(attribute = newAttribute) - case lt: LessThan => - val newAttribute = InternalSchemaUtils.reBuildFilterName(lt.attribute, fileSchema, querySchema) - if (newAttribute.isEmpty) AlwaysTrue else lt.copy(attribute = newAttribute) - case lte: LessThanOrEqual => - val newAttribute = InternalSchemaUtils.reBuildFilterName(lte.attribute, fileSchema, querySchema) - if (newAttribute.isEmpty) AlwaysTrue else lte.copy(attribute = newAttribute) - case i: In => - val newAttribute = InternalSchemaUtils.reBuildFilterName(i.attribute, fileSchema, querySchema) - if (newAttribute.isEmpty) AlwaysTrue else i.copy(attribute = newAttribute) - case isn: IsNull => - val newAttribute = InternalSchemaUtils.reBuildFilterName(isn.attribute, fileSchema, querySchema) - if (newAttribute.isEmpty) AlwaysTrue else isn.copy(attribute = newAttribute) - case isnn: IsNotNull => - val newAttribute = InternalSchemaUtils.reBuildFilterName(isnn.attribute, fileSchema, querySchema) - if (newAttribute.isEmpty) AlwaysTrue else isnn.copy(attribute = newAttribute) - case And(left, right) => - And(rebuildFilterFromParquet(left, fileSchema, querySchema), rebuildFilterFromParquet(right, fileSchema, querySchema)) - case Or(left, right) => - Or(rebuildFilterFromParquet(left, fileSchema, querySchema), rebuildFilterFromParquet(right, fileSchema, querySchema)) - case 
Not(child) => - Not(rebuildFilterFromParquet(child, fileSchema, querySchema)) - case ssw: StringStartsWith => - val newAttribute = InternalSchemaUtils.reBuildFilterName(ssw.attribute, fileSchema, querySchema) - if (newAttribute.isEmpty) AlwaysTrue else ssw.copy(attribute = newAttribute) - case ses: StringEndsWith => - val newAttribute = InternalSchemaUtils.reBuildFilterName(ses.attribute, fileSchema, querySchema) - if (newAttribute.isEmpty) AlwaysTrue else ses.copy(attribute = newAttribute) - case sc: StringContains => - val newAttribute = InternalSchemaUtils.reBuildFilterName(sc.attribute, fileSchema, querySchema) - if (newAttribute.isEmpty) AlwaysTrue else sc.copy(attribute = newAttribute) - case AlwaysTrue => - AlwaysTrue - case AlwaysFalse => - AlwaysFalse - case _ => - AlwaysTrue - } - } - } -} diff --git a/hudi-spark-datasource/hudi-spark3.1.x/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark31HoodieParquetFileFormat.scala b/hudi-spark-datasource/hudi-spark3.1.x/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark31HoodieParquetFileFormat.scala new file mode 100644 index 0000000000000..e99850bef06b8 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark3.1.x/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark31HoodieParquetFileFormat.scala @@ -0,0 +1,411 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources.parquet + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl +import org.apache.hadoop.mapreduce.{JobID, TaskAttemptID, TaskID, TaskType} +import org.apache.hudi.HoodieSparkUtils +import org.apache.hudi.client.utils.SparkInternalSchemaConverter +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.util.StringUtils.isNullOrEmpty +import org.apache.hudi.common.util.{InternalSchemaCache, ReflectionUtils, StringUtils} +import org.apache.hudi.common.util.collection.Pair +import org.apache.hudi.internal.schema.InternalSchema +import org.apache.hudi.internal.schema.action.InternalSchemaMerger +import org.apache.hudi.internal.schema.utils.{InternalSchemaUtils, SerDeHelper} +import org.apache.parquet.filter2.compat.FilterCompat +import org.apache.parquet.filter2.predicate.FilterApi +import org.apache.parquet.format.converter.ParquetMetadataConverter.SKIP_ROW_GROUPS +import org.apache.parquet.hadoop.{ParquetFileReader, ParquetInputFormat, ParquetRecordReader} +import org.apache.spark.TaskContext +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.avro.AvroDeserializer +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection +import org.apache.spark.sql.catalyst.expressions.{Cast, JoinedRow} +import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.execution.datasources.parquet.Spark31HoodieParquetFileFormat.{createParquetFilters, pruneInternalSchema, rebuildFilterFromParquet} +import org.apache.spark.sql.execution.datasources.{DataSourceUtils, PartitionedFile, RecordReaderIterator} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources._ +import org.apache.spark.sql.types.{AtomicType, DataType, StructField, StructType} +import org.apache.spark.util.SerializableConfiguration + +import java.net.URI + + +/** + * This class is an extension of [[ParquetFileFormat]] overriding Spark-specific behavior + * that's not possible to customize in any other way + * + * NOTE: This is a version of [[AvroDeserializer]] impl from Spark 3.1.2 w/ w/ the following changes applied to it: + *
+ * <ol>
+ *   <li>Avoiding appending partition values to the rows read from the data file</li>
+ *   <li>Schema on-read</li>
+ * </ol>
+ */ +class Spark31HoodieParquetFileFormat(private val shouldAppendPartitionValues: Boolean) extends ParquetFileFormat { + + override def buildReaderWithPartitionValues(sparkSession: SparkSession, + dataSchema: StructType, + partitionSchema: StructType, + requiredSchema: StructType, + filters: Seq[Filter], + options: Map[String, String], + hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { + hadoopConf.set(ParquetInputFormat.READ_SUPPORT_CLASS, classOf[ParquetReadSupport].getName) + hadoopConf.set( + ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA, + requiredSchema.json) + hadoopConf.set( + ParquetWriteSupport.SPARK_ROW_SCHEMA, + requiredSchema.json) + hadoopConf.set( + SQLConf.SESSION_LOCAL_TIMEZONE.key, + sparkSession.sessionState.conf.sessionLocalTimeZone) + hadoopConf.setBoolean( + SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key, + sparkSession.sessionState.conf.nestedSchemaPruningEnabled) + hadoopConf.setBoolean( + SQLConf.CASE_SENSITIVE.key, + sparkSession.sessionState.conf.caseSensitiveAnalysis) + + ParquetWriteSupport.setSchema(requiredSchema, hadoopConf) + + // Sets flags for `ParquetToSparkSchemaConverter` + hadoopConf.setBoolean( + SQLConf.PARQUET_BINARY_AS_STRING.key, + sparkSession.sessionState.conf.isParquetBinaryAsString) + hadoopConf.setBoolean( + SQLConf.PARQUET_INT96_AS_TIMESTAMP.key, + sparkSession.sessionState.conf.isParquetINT96AsTimestamp) + + val internalSchemaStr = hadoopConf.get(SparkInternalSchemaConverter.HOODIE_QUERY_SCHEMA) + // For Spark DataSource v1, there's no Physical Plan projection/schema pruning w/in Spark itself, + // therefore it's safe to do schema projection here + if (!isNullOrEmpty(internalSchemaStr)) { + val prunedInternalSchemaStr = + pruneInternalSchema(internalSchemaStr, requiredSchema) + hadoopConf.set(SparkInternalSchemaConverter.HOODIE_QUERY_SCHEMA, prunedInternalSchemaStr) + } + + val broadcastedHadoopConf = + sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) + + // TODO: if you move this into the closure it reverts to the default values. + // If true, enable using the custom RecordReader for parquet. This only works for + // a subset of the types (no complex types). 
+ val resultSchema = StructType(partitionSchema.fields ++ requiredSchema.fields) + val sqlConf = sparkSession.sessionState.conf + val enableOffHeapColumnVector = sqlConf.offHeapColumnVectorEnabled + val enableVectorizedReader: Boolean = + sqlConf.parquetVectorizedReaderEnabled && + resultSchema.forall(_.dataType.isInstanceOf[AtomicType]) + val enableRecordFilter: Boolean = sqlConf.parquetRecordFilterEnabled + val timestampConversion: Boolean = sqlConf.isParquetINT96TimestampConversion + val capacity = sqlConf.parquetVectorizedReaderBatchSize + val enableParquetFilterPushDown: Boolean = sqlConf.parquetFilterPushDown + // Whole stage codegen (PhysicalRDD) is able to deal with batches directly + val returningBatch = supportBatch(sparkSession, resultSchema) + val pushDownDate = sqlConf.parquetFilterPushDownDate + val pushDownTimestamp = sqlConf.parquetFilterPushDownTimestamp + val pushDownDecimal = sqlConf.parquetFilterPushDownDecimal + val pushDownStringStartWith = sqlConf.parquetFilterPushDownStringStartWith + val pushDownInFilterThreshold = sqlConf.parquetFilterPushDownInFilterThreshold + val isCaseSensitive = sqlConf.caseSensitiveAnalysis + + (file: PartitionedFile) => { + assert(!shouldAppendPartitionValues || file.partitionValues.numFields == partitionSchema.size) + + val filePath = new Path(new URI(file.filePath)) + val split = + new org.apache.parquet.hadoop.ParquetInputSplit( + filePath, + file.start, + file.start + file.length, + file.length, + Array.empty, + null) + + val sharedConf = broadcastedHadoopConf.value.value + + // Fetch internal schema + val internalSchemaStr = sharedConf.get(SparkInternalSchemaConverter.HOODIE_QUERY_SCHEMA) + // Internal schema has to be pruned at this point + val querySchemaOption = SerDeHelper.fromJson(internalSchemaStr) + + val shouldUseInternalSchema = !isNullOrEmpty(internalSchemaStr) && querySchemaOption.isPresent + + val tablePath = sharedConf.get(SparkInternalSchemaConverter.HOODIE_TABLE_PATH) + val fileSchema = if (shouldUseInternalSchema) { + val commitInstantTime = FSUtils.getCommitTime(filePath.getName).toLong; + val validCommits = sharedConf.get(SparkInternalSchemaConverter.HOODIE_VALID_COMMITS_LIST) + InternalSchemaCache.getInternalSchemaByVersionId(commitInstantTime, tablePath, sharedConf, if (validCommits == null) "" else validCommits) + } else { + null + } + + lazy val footerFileMetaData = + ParquetFileReader.readFooter(sharedConf, filePath, SKIP_ROW_GROUPS).getFileMetaData + val datetimeRebaseMode = DataSourceUtils.datetimeRebaseMode( + footerFileMetaData.getKeyValueMetaData.get, + SQLConf.get.getConf(SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_READ)) + // Try to push down filters when filter push-down is enabled. + val pushed = if (enableParquetFilterPushDown) { + val parquetSchema = footerFileMetaData.getSchema + val parquetFilters = if (HoodieSparkUtils.gteqSpark3_1_3) { + createParquetFilters( + parquetSchema, + pushDownDate, + pushDownTimestamp, + pushDownDecimal, + pushDownStringStartWith, + pushDownInFilterThreshold, + isCaseSensitive, + datetimeRebaseMode) + } else { + createParquetFilters( + parquetSchema, + pushDownDate, + pushDownTimestamp, + pushDownDecimal, + pushDownStringStartWith, + pushDownInFilterThreshold, + isCaseSensitive) + } + filters.map(rebuildFilterFromParquet(_, fileSchema, querySchemaOption.orElse(null))) + // Collects all converted Parquet filter predicates. Notice that not all predicates can be + // converted (`ParquetFilters.createFilter` returns an `Option`). That's why a `flatMap` + // is used here. 
+ .flatMap(parquetFilters.createFilter) + .reduceOption(FilterApi.and) + } else { + None + } + + // PARQUET_INT96_TIMESTAMP_CONVERSION says to apply timezone conversions to int96 timestamps' + // *only* if the file was created by something other than "parquet-mr", so check the actual + // writer here for this file. We have to do this per-file, as each file in the table may + // have different writers. + // Define isCreatedByParquetMr as function to avoid unnecessary parquet footer reads. + def isCreatedByParquetMr: Boolean = + footerFileMetaData.getCreatedBy().startsWith("parquet-mr") + + val convertTz = + if (timestampConversion && !isCreatedByParquetMr) { + Some(DateTimeUtils.getZoneId(sharedConf.get(SQLConf.SESSION_LOCAL_TIMEZONE.key))) + } else { + None + } + + val int96RebaseMode = DataSourceUtils.int96RebaseMode( + footerFileMetaData.getKeyValueMetaData.get, + SQLConf.get.getConf(SQLConf.LEGACY_PARQUET_INT96_REBASE_MODE_IN_READ)) + + val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0) + + // Clone new conf + val hadoopAttemptConf = new Configuration(broadcastedHadoopConf.value.value) + var typeChangeInfos: java.util.Map[Integer, Pair[DataType, DataType]] = if (shouldUseInternalSchema) { + val mergedInternalSchema = new InternalSchemaMerger(fileSchema, querySchemaOption.get(), true, true).mergeSchema() + val mergedSchema = SparkInternalSchemaConverter.constructSparkSchemaFromInternalSchema(mergedInternalSchema) + + hadoopAttemptConf.set(ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA, mergedSchema.json) + + SparkInternalSchemaConverter.collectTypeChangedCols(querySchemaOption.get(), mergedInternalSchema) + } else { + new java.util.HashMap() + } + + val hadoopAttemptContext = + new TaskAttemptContextImpl(hadoopAttemptConf, attemptId) + + // Try to push down filters when filter push-down is enabled. + // Notice: This push-down is RowGroups level, not individual records. + if (pushed.isDefined) { + ParquetInputFormat.setFilterPredicate(hadoopAttemptContext.getConfiguration, pushed.get) + } + val taskContext = Option(TaskContext.get()) + if (enableVectorizedReader) { + val vectorizedReader = + if (shouldUseInternalSchema) { + new Spark312HoodieVectorizedParquetRecordReader( + convertTz.orNull, + datetimeRebaseMode.toString, + int96RebaseMode.toString, + enableOffHeapColumnVector && taskContext.isDefined, + capacity, + typeChangeInfos) + } else { + new VectorizedParquetRecordReader( + convertTz.orNull, + datetimeRebaseMode.toString, + int96RebaseMode.toString, + enableOffHeapColumnVector && taskContext.isDefined, + capacity) + } + + val iter = new RecordReaderIterator(vectorizedReader) + // SPARK-23457 Register a task completion listener before `initialization`. + taskContext.foreach(_.addTaskCompletionListener[Unit](_ => iter.close())) + vectorizedReader.initialize(split, hadoopAttemptContext) + + // NOTE: We're making appending of the partitioned values to the rows read from the + // data file configurable + if (shouldAppendPartitionValues) { + logDebug(s"Appending $partitionSchema ${file.partitionValues}") + vectorizedReader.initBatch(partitionSchema, file.partitionValues) + } else { + vectorizedReader.initBatch(StructType(Nil), InternalRow.empty) + } + + if (returningBatch) { + vectorizedReader.enableReturningBatches() + } + + // UnsafeRowParquetRecordReader appends the columns internally to avoid another copy. 
+ iter.asInstanceOf[Iterator[InternalRow]] + } else { + logDebug(s"Falling back to parquet-mr") + // ParquetRecordReader returns InternalRow + val readSupport = new ParquetReadSupport( + convertTz, + enableVectorizedReader = false, + datetimeRebaseMode, + int96RebaseMode) + val reader = if (pushed.isDefined && enableRecordFilter) { + val parquetFilter = FilterCompat.get(pushed.get, null) + new ParquetRecordReader[InternalRow](readSupport, parquetFilter) + } else { + new ParquetRecordReader[InternalRow](readSupport) + } + val iter = new RecordReaderIterator[InternalRow](reader) + // SPARK-23457 Register a task completion listener before `initialization`. + taskContext.foreach(_.addTaskCompletionListener[Unit](_ => iter.close())) + reader.initialize(split, hadoopAttemptContext) + + val fullSchema = requiredSchema.toAttributes ++ partitionSchema.toAttributes + val unsafeProjection = if (typeChangeInfos.isEmpty) { + GenerateUnsafeProjection.generate(fullSchema, fullSchema) + } else { + // find type changed. + val newFullSchema = new StructType(requiredSchema.fields.zipWithIndex.map { case (f, i) => + if (typeChangeInfos.containsKey(i)) { + StructField(f.name, typeChangeInfos.get(i).getRight, f.nullable, f.metadata) + } else f + }).toAttributes ++ partitionSchema.toAttributes + val castSchema = newFullSchema.zipWithIndex.map { case (attr, i) => + if (typeChangeInfos.containsKey(i)) { + Cast(attr, typeChangeInfos.get(i).getLeft) + } else attr + } + GenerateUnsafeProjection.generate(castSchema, newFullSchema) + } + + // NOTE: We're making appending of the partitioned values to the rows read from the + // data file configurable + if (!shouldAppendPartitionValues || partitionSchema.length == 0) { + // There is no partition columns + iter.map(unsafeProjection) + } else { + val joinedRow = new JoinedRow() + iter.map(d => unsafeProjection(joinedRow(d, file.partitionValues))) + } + } + } + } +} + +object Spark31HoodieParquetFileFormat { + + def pruneInternalSchema(internalSchemaStr: String, requiredSchema: StructType): String = { + val querySchemaOption = SerDeHelper.fromJson(internalSchemaStr) + if (querySchemaOption.isPresent && requiredSchema.nonEmpty) { + val prunedSchema = SparkInternalSchemaConverter.convertAndPruneStructTypeToInternalSchema(requiredSchema, querySchemaOption.get()) + SerDeHelper.toJson(prunedSchema) + } else { + internalSchemaStr + } + } + + private def createParquetFilters(args: Any*): ParquetFilters = { + // ParquetFilters bears a single ctor (in Spark 3.1) + val ctor = classOf[ParquetFilters].getConstructors.head + ctor.newInstance(args.map(_.asInstanceOf[AnyRef]): _*) + .asInstanceOf[ParquetFilters] + } + + private def rebuildFilterFromParquet(oldFilter: Filter, fileSchema: InternalSchema, querySchema: InternalSchema): Filter = { + if (fileSchema == null || querySchema == null) { + oldFilter + } else { + oldFilter match { + case eq: EqualTo => + val newAttribute = InternalSchemaUtils.reBuildFilterName(eq.attribute, fileSchema, querySchema) + if (newAttribute.isEmpty) AlwaysTrue else eq.copy(attribute = newAttribute) + case eqs: EqualNullSafe => + val newAttribute = InternalSchemaUtils.reBuildFilterName(eqs.attribute, fileSchema, querySchema) + if (newAttribute.isEmpty) AlwaysTrue else eqs.copy(attribute = newAttribute) + case gt: GreaterThan => + val newAttribute = InternalSchemaUtils.reBuildFilterName(gt.attribute, fileSchema, querySchema) + if (newAttribute.isEmpty) AlwaysTrue else gt.copy(attribute = newAttribute) + case gtr: GreaterThanOrEqual => + val newAttribute = 
InternalSchemaUtils.reBuildFilterName(gtr.attribute, fileSchema, querySchema) + if (newAttribute.isEmpty) AlwaysTrue else gtr.copy(attribute = newAttribute) + case lt: LessThan => + val newAttribute = InternalSchemaUtils.reBuildFilterName(lt.attribute, fileSchema, querySchema) + if (newAttribute.isEmpty) AlwaysTrue else lt.copy(attribute = newAttribute) + case lte: LessThanOrEqual => + val newAttribute = InternalSchemaUtils.reBuildFilterName(lte.attribute, fileSchema, querySchema) + if (newAttribute.isEmpty) AlwaysTrue else lte.copy(attribute = newAttribute) + case i: In => + val newAttribute = InternalSchemaUtils.reBuildFilterName(i.attribute, fileSchema, querySchema) + if (newAttribute.isEmpty) AlwaysTrue else i.copy(attribute = newAttribute) + case isn: IsNull => + val newAttribute = InternalSchemaUtils.reBuildFilterName(isn.attribute, fileSchema, querySchema) + if (newAttribute.isEmpty) AlwaysTrue else isn.copy(attribute = newAttribute) + case isnn: IsNotNull => + val newAttribute = InternalSchemaUtils.reBuildFilterName(isnn.attribute, fileSchema, querySchema) + if (newAttribute.isEmpty) AlwaysTrue else isnn.copy(attribute = newAttribute) + case And(left, right) => + And(rebuildFilterFromParquet(left, fileSchema, querySchema), rebuildFilterFromParquet(right, fileSchema, querySchema)) + case Or(left, right) => + Or(rebuildFilterFromParquet(left, fileSchema, querySchema), rebuildFilterFromParquet(right, fileSchema, querySchema)) + case Not(child) => + Not(rebuildFilterFromParquet(child, fileSchema, querySchema)) + case ssw: StringStartsWith => + val newAttribute = InternalSchemaUtils.reBuildFilterName(ssw.attribute, fileSchema, querySchema) + if (newAttribute.isEmpty) AlwaysTrue else ssw.copy(attribute = newAttribute) + case ses: StringEndsWith => + val newAttribute = InternalSchemaUtils.reBuildFilterName(ses.attribute, fileSchema, querySchema) + if (newAttribute.isEmpty) AlwaysTrue else ses.copy(attribute = newAttribute) + case sc: StringContains => + val newAttribute = InternalSchemaUtils.reBuildFilterName(sc.attribute, fileSchema, querySchema) + if (newAttribute.isEmpty) AlwaysTrue else sc.copy(attribute = newAttribute) + case AlwaysTrue => + AlwaysTrue + case AlwaysFalse => + AlwaysFalse + case _ => + AlwaysTrue + } + } + } +} diff --git a/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/adapter/Spark3_2Adapter.scala b/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/adapter/Spark3_2Adapter.scala index bad392b4f97ac..15624c741130c 100644 --- a/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/adapter/Spark3_2Adapter.scala +++ b/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/adapter/Spark3_2Adapter.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.SPARK_VERSION import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat +import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat, Spark32HoodieParquetFileFormat} import org.apache.spark.sql.parser.HoodieSpark3_2ExtendedSqlParser import org.apache.spark.sql.types.DataType import org.apache.spark.sql.{HoodieCatalystExpressionUtils, HoodieSpark3_2CatalystExpressionUtils, SparkSession} @@ -80,14 +80,7 @@ class Spark3_2Adapter extends BaseSpark3Adapter { } } - override def createHoodieParquetFileFormat(): Option[ParquetFileFormat] = { - if 
(SPARK_VERSION.startsWith("3.2")) { - val loadClassName = "org.apache.spark.sql.execution.datasources.parquet.Spark32HoodieParquetFileFormat" - val clazz = Class.forName(loadClassName, true, Thread.currentThread().getContextClassLoader) - val ctor = clazz.getConstructors.head - Some(ctor.newInstance().asInstanceOf[ParquetFileFormat]) - } else { - None - } + override def createHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat] = { + Some(new Spark32HoodieParquetFileFormat(appendPartitionValues)) } } diff --git a/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark32DataSourceUtils.scala b/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark32DataSourceUtils.scala new file mode 100644 index 0000000000000..6d1c76380f216 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark32DataSourceUtils.scala @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.parquet + +import org.apache.spark.sql.SPARK_VERSION_METADATA_KEY +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy +import org.apache.spark.util.Utils + +object Spark32DataSourceUtils { + + /** + * NOTE: This method was copied from Spark 3.2.0, and is required to maintain runtime + * compatibility against Spark 3.2.0 + */ + // scalastyle:off + def int96RebaseMode(lookupFileMeta: String => String, + modeByConfig: String): LegacyBehaviorPolicy.Value = { + if (Utils.isTesting && SQLConf.get.getConfString("spark.test.forceNoRebase", "") == "true") { + return LegacyBehaviorPolicy.CORRECTED + } + // If there is no version, we return the mode specified by the config. + Option(lookupFileMeta(SPARK_VERSION_METADATA_KEY)).map { version => + // Files written by Spark 3.0 and earlier follow the legacy hybrid calendar and we need to + // rebase the INT96 timestamp values. + // Files written by Spark 3.1 and latter may also need the rebase if they were written with + // the "LEGACY" rebase mode. 
+ if (version < "3.1.0" || lookupFileMeta("org.apache.spark.legacyINT96") != null) { + LegacyBehaviorPolicy.LEGACY + } else { + LegacyBehaviorPolicy.CORRECTED + } + }.getOrElse(LegacyBehaviorPolicy.withName(modeByConfig)) + } + // scalastyle:on + + /** + * NOTE: This method was copied from Spark 3.2.0, and is required to maintain runtime + * compatibility against Spark 3.2.0 + */ + // scalastyle:off + def datetimeRebaseMode(lookupFileMeta: String => String, + modeByConfig: String): LegacyBehaviorPolicy.Value = { + if (Utils.isTesting && SQLConf.get.getConfString("spark.test.forceNoRebase", "") == "true") { + return LegacyBehaviorPolicy.CORRECTED + } + // If there is no version, we return the mode specified by the config. + Option(lookupFileMeta(SPARK_VERSION_METADATA_KEY)).map { version => + // Files written by Spark 2.4 and earlier follow the legacy hybrid calendar and we need to + // rebase the datetime values. + // Files written by Spark 3.0 and latter may also need the rebase if they were written with + // the "LEGACY" rebase mode. + if (version < "3.0.0" || lookupFileMeta("org.apache.spark.legacyDateTime") != null) { + LegacyBehaviorPolicy.LEGACY + } else { + LegacyBehaviorPolicy.CORRECTED + } + }.getOrElse(LegacyBehaviorPolicy.withName(modeByConfig)) + } + // scalastyle:on + +} diff --git a/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark32HoodieParquetFileFormat.scala b/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark32HoodieParquetFileFormat.scala index 28db4739656e7..7135f19e95e2d 100644 --- a/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark32HoodieParquetFileFormat.scala +++ b/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark32HoodieParquetFileFormat.scala @@ -17,16 +17,16 @@ package org.apache.spark.sql.execution.datasources.parquet -import java.net.URI - import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapred.FileSplit import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl import org.apache.hadoop.mapreduce.{JobID, TaskAttemptID, TaskID, TaskType} +import org.apache.hudi.HoodieSparkUtils import org.apache.hudi.client.utils.SparkInternalSchemaConverter import org.apache.hudi.common.fs.FSUtils import org.apache.hudi.common.util.InternalSchemaCache +import org.apache.hudi.common.util.StringUtils.isNullOrEmpty import org.apache.hudi.common.util.collection.Pair import org.apache.hudi.internal.schema.InternalSchema import org.apache.hudi.internal.schema.action.InternalSchemaMerger @@ -34,124 +34,141 @@ import org.apache.hudi.internal.schema.utils.{InternalSchemaUtils, SerDeHelper} import org.apache.parquet.filter2.compat.FilterCompat import org.apache.parquet.filter2.predicate.FilterApi import org.apache.parquet.format.converter.ParquetMetadataConverter.SKIP_ROW_GROUPS -import org.apache.parquet.hadoop.{ParquetFileReader, ParquetInputFormat, ParquetRecordReader} +import org.apache.parquet.hadoop.{ParquetInputFormat, ParquetRecordReader} import org.apache.spark.TaskContext import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Cast, JoinedRow} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection +import org.apache.spark.sql.catalyst.expressions.{Cast, JoinedRow} import 
org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.execution.datasources.parquet.Spark32HoodieParquetFileFormat._ import org.apache.spark.sql.execution.datasources.{DataSourceUtils, PartitionedFile, RecordReaderIterator} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{AtomicType, DataType, StructField, StructType} import org.apache.spark.util.SerializableConfiguration -class Spark32HoodieParquetFileFormat extends ParquetFileFormat { - - // reference ParquetFileFormat from spark project - override def buildReaderWithPartitionValues( - sparkSession: SparkSession, - dataSchema: StructType, - partitionSchema: StructType, - requiredSchema: StructType, - filters: Seq[Filter], - options: Map[String, String], - hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { - if (hadoopConf.get(SparkInternalSchemaConverter.HOODIE_QUERY_SCHEMA, "").isEmpty) { - // fallback to origin parquet File read - super.buildReaderWithPartitionValues(sparkSession, dataSchema, partitionSchema, requiredSchema, filters, options, hadoopConf) - } else { - hadoopConf.set(ParquetInputFormat.READ_SUPPORT_CLASS, classOf[ParquetReadSupport].getName) - hadoopConf.set( - ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA, - requiredSchema.json) - hadoopConf.set( - ParquetWriteSupport.SPARK_ROW_SCHEMA, - requiredSchema.json) - hadoopConf.set( - SQLConf.SESSION_LOCAL_TIMEZONE.key, - sparkSession.sessionState.conf.sessionLocalTimeZone) - hadoopConf.setBoolean( - SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key, - sparkSession.sessionState.conf.nestedSchemaPruningEnabled) - hadoopConf.setBoolean( - SQLConf.CASE_SENSITIVE.key, - sparkSession.sessionState.conf.caseSensitiveAnalysis) - - ParquetWriteSupport.setSchema(requiredSchema, hadoopConf) - - // Sets flags for `ParquetToSparkSchemaConverter` - hadoopConf.setBoolean( - SQLConf.PARQUET_BINARY_AS_STRING.key, - sparkSession.sessionState.conf.isParquetBinaryAsString) - hadoopConf.setBoolean( - SQLConf.PARQUET_INT96_AS_TIMESTAMP.key, - sparkSession.sessionState.conf.isParquetINT96AsTimestamp) - // for dataSource v1, we have no method to do project for spark physical plan. - // it's safe to do cols project here. - val internalSchemaString = hadoopConf.get(SparkInternalSchemaConverter.HOODIE_QUERY_SCHEMA) - val querySchemaOption = SerDeHelper.fromJson(internalSchemaString) - if (querySchemaOption.isPresent && !requiredSchema.isEmpty) { - val prunedSchema = SparkInternalSchemaConverter.convertAndPruneStructTypeToInternalSchema(requiredSchema, querySchemaOption.get()) - hadoopConf.set(SparkInternalSchemaConverter.HOODIE_QUERY_SCHEMA, SerDeHelper.toJson(prunedSchema)) - } - val broadcastedHadoopConf = - sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) - - // TODO: if you move this into the closure it reverts to the default values. - // If true, enable using the custom RecordReader for parquet. This only works for - // a subset of the types (no complex types). 
- val resultSchema = StructType(partitionSchema.fields ++ requiredSchema.fields) - val sqlConf = sparkSession.sessionState.conf - val enableOffHeapColumnVector = sqlConf.offHeapColumnVectorEnabled - val enableVectorizedReader: Boolean = - sqlConf.parquetVectorizedReaderEnabled && - resultSchema.forall(_.dataType.isInstanceOf[AtomicType]) - val enableRecordFilter: Boolean = sqlConf.parquetRecordFilterEnabled - val timestampConversion: Boolean = sqlConf.isParquetINT96TimestampConversion - val capacity = sqlConf.parquetVectorizedReaderBatchSize - val enableParquetFilterPushDown: Boolean = sqlConf.parquetFilterPushDown - // Whole stage codegen (PhysicalRDD) is able to deal with batches directly - val returningBatch = supportBatch(sparkSession, resultSchema) - val pushDownDate = sqlConf.parquetFilterPushDownDate - val pushDownTimestamp = sqlConf.parquetFilterPushDownTimestamp - val pushDownDecimal = sqlConf.parquetFilterPushDownDecimal - val pushDownStringStartWith = sqlConf.parquetFilterPushDownStringStartWith - val pushDownInFilterThreshold = sqlConf.parquetFilterPushDownInFilterThreshold - val isCaseSensitive = sqlConf.caseSensitiveAnalysis - val parquetOptions = new ParquetOptions(options, sparkSession.sessionState.conf) - val datetimeRebaseModeInRead = parquetOptions.datetimeRebaseModeInRead - val int96RebaseModeInread = parquetOptions.int96RebaseModeInRead - - (file: PartitionedFile) => { - assert(file.partitionValues.numFields == partitionSchema.size) - val filePath = new Path(new URI(file.filePath)) - val split = new FileSplit(filePath, file.start, file.length, Array.empty[String]) - val sharedConf = broadcastedHadoopConf.value.value - // do deal with internalSchema - val internalSchemaString = sharedConf.get(SparkInternalSchemaConverter.HOODIE_QUERY_SCHEMA) - // querySchema must be a pruned schema. - val querySchemaOption = SerDeHelper.fromJson(internalSchemaString) - val internalSchemaChangeEnabled = if (internalSchemaString.isEmpty || !querySchemaOption.isPresent) false else true - val tablePath = sharedConf.get(SparkInternalSchemaConverter.HOODIE_TABLE_PATH) +import java.net.URI + +/** + * This class is an extension of [[ParquetFileFormat]] overriding Spark-specific behavior + * that's not possible to customize in any other way + * + * NOTE: This is a version of [[AvroDeserializer]] impl from Spark 3.2.1 w/ w/ the following changes applied to it: + *
+ * 1. Avoiding appending partition values to the rows read from the data file
+ * 2. Schema on-read
+ *
+ */ +class Spark32HoodieParquetFileFormat(private val shouldAppendPartitionValues: Boolean) extends ParquetFileFormat { + + override def buildReaderWithPartitionValues(sparkSession: SparkSession, + dataSchema: StructType, + partitionSchema: StructType, + requiredSchema: StructType, + filters: Seq[Filter], + options: Map[String, String], + hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { + hadoopConf.set(ParquetInputFormat.READ_SUPPORT_CLASS, classOf[ParquetReadSupport].getName) + hadoopConf.set( + ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA, + requiredSchema.json) + hadoopConf.set( + ParquetWriteSupport.SPARK_ROW_SCHEMA, + requiredSchema.json) + hadoopConf.set( + SQLConf.SESSION_LOCAL_TIMEZONE.key, + sparkSession.sessionState.conf.sessionLocalTimeZone) + hadoopConf.setBoolean( + SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key, + sparkSession.sessionState.conf.nestedSchemaPruningEnabled) + hadoopConf.setBoolean( + SQLConf.CASE_SENSITIVE.key, + sparkSession.sessionState.conf.caseSensitiveAnalysis) + + ParquetWriteSupport.setSchema(requiredSchema, hadoopConf) + + // Sets flags for `ParquetToSparkSchemaConverter` + hadoopConf.setBoolean( + SQLConf.PARQUET_BINARY_AS_STRING.key, + sparkSession.sessionState.conf.isParquetBinaryAsString) + hadoopConf.setBoolean( + SQLConf.PARQUET_INT96_AS_TIMESTAMP.key, + sparkSession.sessionState.conf.isParquetINT96AsTimestamp) + + val internalSchemaStr = hadoopConf.get(SparkInternalSchemaConverter.HOODIE_QUERY_SCHEMA) + // For Spark DataSource v1, there's no Physical Plan projection/schema pruning w/in Spark itself, + // therefore it's safe to do schema projection here + if (!isNullOrEmpty(internalSchemaStr)) { + val prunedInternalSchemaStr = + pruneInternalSchema(internalSchemaStr, requiredSchema) + hadoopConf.set(SparkInternalSchemaConverter.HOODIE_QUERY_SCHEMA, prunedInternalSchemaStr) + } + + val broadcastedHadoopConf = + sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) + + // TODO: if you move this into the closure it reverts to the default values. + // If true, enable using the custom RecordReader for parquet. This only works for + // a subset of the types (no complex types). 
+ val resultSchema = StructType(partitionSchema.fields ++ requiredSchema.fields) + val sqlConf = sparkSession.sessionState.conf + val enableOffHeapColumnVector = sqlConf.offHeapColumnVectorEnabled + val enableVectorizedReader: Boolean = + sqlConf.parquetVectorizedReaderEnabled && + resultSchema.forall(_.dataType.isInstanceOf[AtomicType]) + val enableRecordFilter: Boolean = sqlConf.parquetRecordFilterEnabled + val timestampConversion: Boolean = sqlConf.isParquetINT96TimestampConversion + val capacity = sqlConf.parquetVectorizedReaderBatchSize + val enableParquetFilterPushDown: Boolean = sqlConf.parquetFilterPushDown + // Whole stage codegen (PhysicalRDD) is able to deal with batches directly + val returningBatch = supportBatch(sparkSession, resultSchema) + val pushDownDate = sqlConf.parquetFilterPushDownDate + val pushDownTimestamp = sqlConf.parquetFilterPushDownTimestamp + val pushDownDecimal = sqlConf.parquetFilterPushDownDecimal + val pushDownStringStartWith = sqlConf.parquetFilterPushDownStringStartWith + val pushDownInFilterThreshold = sqlConf.parquetFilterPushDownInFilterThreshold + val isCaseSensitive = sqlConf.caseSensitiveAnalysis + val parquetOptions = new ParquetOptions(options, sparkSession.sessionState.conf) + val datetimeRebaseModeInRead = parquetOptions.datetimeRebaseModeInRead + val int96RebaseModeInRead = parquetOptions.int96RebaseModeInRead + + (file: PartitionedFile) => { + assert(!shouldAppendPartitionValues || file.partitionValues.numFields == partitionSchema.size) + + val filePath = new Path(new URI(file.filePath)) + val split = new FileSplit(filePath, file.start, file.length, Array.empty[String]) + + val sharedConf = broadcastedHadoopConf.value.value + + // Fetch internal schema + val internalSchemaStr = sharedConf.get(SparkInternalSchemaConverter.HOODIE_QUERY_SCHEMA) + // Internal schema has to be pruned at this point + val querySchemaOption = SerDeHelper.fromJson(internalSchemaStr) + + val shouldUseInternalSchema = !isNullOrEmpty(internalSchemaStr) && querySchemaOption.isPresent + + val tablePath = sharedConf.get(SparkInternalSchemaConverter.HOODIE_TABLE_PATH) + val fileSchema = if (shouldUseInternalSchema) { val commitInstantTime = FSUtils.getCommitTime(filePath.getName).toLong; - val fileSchema = if (internalSchemaChangeEnabled) { - val validCommits = sharedConf.get(SparkInternalSchemaConverter.HOODIE_VALID_COMMITS_LIST) - InternalSchemaCache.getInternalSchemaByVersionId(commitInstantTime, tablePath, sharedConf, if (validCommits == null) "" else validCommits) - } else { - // this should not happened, searchSchemaAndCache will deal with correctly. - null - } + val validCommits = sharedConf.get(SparkInternalSchemaConverter.HOODIE_VALID_COMMITS_LIST) + InternalSchemaCache.getInternalSchemaByVersionId(commitInstantTime, tablePath, sharedConf, if (validCommits == null) "" else validCommits) + } else { + null + } - lazy val footerFileMetaData = - ParquetFooterReader.readFooter(sharedConf, filePath, SKIP_ROW_GROUPS).getFileMetaData - val datetimeRebaseSpec = DataSourceUtils.datetimeRebaseSpec( - footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead) - // Try to push down filters when filter push-down is enabled. - val pushed = if (enableParquetFilterPushDown) { - val parquetSchema = footerFileMetaData.getSchema - val parquetFilters = new ParquetFilters( + lazy val footerFileMetaData = + ParquetFooterReader.readFooter(sharedConf, filePath, SKIP_ROW_GROUPS).getFileMetaData + // Try to push down filters when filter push-down is enabled. 
+ val pushed = if (enableParquetFilterPushDown) { + val parquetSchema = footerFileMetaData.getSchema + val parquetFilters = if (HoodieSparkUtils.gteqSpark3_2_1) { + // NOTE: Below code could only be compiled against >= Spark 3.2.1, + // and unfortunately won't compile against Spark 3.2.0 + // However this code is runtime-compatible w/ both Spark 3.2.0 and >= Spark 3.2.1 + val datetimeRebaseSpec = + DataSourceUtils.datetimeRebaseSpec(footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead) + new ParquetFilters( parquetSchema, pushDownDate, pushDownTimestamp, @@ -160,100 +177,184 @@ class Spark32HoodieParquetFileFormat extends ParquetFileFormat { pushDownInFilterThreshold, isCaseSensitive, datetimeRebaseSpec) - filters.map(Spark32HoodieParquetFileFormat.rebuildFilterFromParquet(_, fileSchema, querySchemaOption.get())) - // Collects all converted Parquet filter predicates. Notice that not all predicates can be - // converted (`ParquetFilters.createFilter` returns an `Option`). That's why a `flatMap` - // is used here. - .flatMap(parquetFilters.createFilter(_)) - .reduceOption(FilterApi.and) + } else { + // Spark 3.2.0 + val datetimeRebaseMode = + Spark32DataSourceUtils.datetimeRebaseMode(footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead) + createParquetFilters( + parquetSchema, + pushDownDate, + pushDownTimestamp, + pushDownDecimal, + pushDownStringStartWith, + pushDownInFilterThreshold, + isCaseSensitive, + datetimeRebaseMode) + } + filters.map(rebuildFilterFromParquet(_, fileSchema, querySchemaOption.orElse(null))) + // Collects all converted Parquet filter predicates. Notice that not all predicates can be + // converted (`ParquetFilters.createFilter` returns an `Option`). That's why a `flatMap` + // is used here. + .flatMap(parquetFilters.createFilter) + .reduceOption(FilterApi.and) + } else { + None + } + + // PARQUET_INT96_TIMESTAMP_CONVERSION says to apply timezone conversions to int96 timestamps' + // *only* if the file was created by something other than "parquet-mr", so check the actual + // writer here for this file. We have to do this per-file, as each file in the table may + // have different writers. + // Define isCreatedByParquetMr as function to avoid unnecessary parquet footer reads. + def isCreatedByParquetMr: Boolean = + footerFileMetaData.getCreatedBy().startsWith("parquet-mr") + + val convertTz = + if (timestampConversion && !isCreatedByParquetMr) { + Some(DateTimeUtils.getZoneId(sharedConf.get(SQLConf.SESSION_LOCAL_TIMEZONE.key))) } else { None } - // PARQUET_INT96_TIMESTAMP_CONVERSION says to apply timezone conversions to int96 timestamps' - // *only* if the file was created by something other than "parquet-mr", so check the actual - // writer here for this file. We have to do this per-file, as each file in the table may - // have different writers. - // Define isCreatedByParquetMr as function to avoid unnecessary parquet footer reads. 
- def isCreatedByParquetMr: Boolean = - footerFileMetaData.getCreatedBy().startsWith("parquet-mr") - - val convertTz = - if (timestampConversion && !isCreatedByParquetMr) { - Some(DateTimeUtils.getZoneId(sharedConf.get(SQLConf.SESSION_LOCAL_TIMEZONE.key))) + val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0) + + // Clone new conf + val hadoopAttemptConf = new Configuration(broadcastedHadoopConf.value.value) + val typeChangeInfos: java.util.Map[Integer, Pair[DataType, DataType]] = if (shouldUseInternalSchema) { + val mergedInternalSchema = new InternalSchemaMerger(fileSchema, querySchemaOption.get(), true, true).mergeSchema() + val mergedSchema = SparkInternalSchemaConverter.constructSparkSchemaFromInternalSchema(mergedInternalSchema) + + hadoopAttemptConf.set(ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA, mergedSchema.json) + + SparkInternalSchemaConverter.collectTypeChangedCols(querySchemaOption.get(), mergedInternalSchema) + } else { + new java.util.HashMap() + } + + val hadoopAttemptContext = + new TaskAttemptContextImpl(hadoopAttemptConf, attemptId) + + // Try to push down filters when filter push-down is enabled. + // Notice: This push-down is RowGroups level, not individual records. + if (pushed.isDefined) { + ParquetInputFormat.setFilterPredicate(hadoopAttemptContext.getConfiguration, pushed.get) + } + val taskContext = Option(TaskContext.get()) + if (enableVectorizedReader) { + val vectorizedReader = + if (shouldUseInternalSchema) { + val int96RebaseSpec = + DataSourceUtils.int96RebaseSpec(footerFileMetaData.getKeyValueMetaData.get, int96RebaseModeInRead) + val datetimeRebaseSpec = + DataSourceUtils.datetimeRebaseSpec(footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead) + new Spark32HoodieVectorizedParquetRecordReader( + convertTz.orNull, + datetimeRebaseSpec.mode.toString, + datetimeRebaseSpec.timeZone, + int96RebaseSpec.mode.toString, + int96RebaseSpec.timeZone, + enableOffHeapColumnVector && taskContext.isDefined, + capacity, + typeChangeInfos) + } else if (HoodieSparkUtils.gteqSpark3_2_1) { + // NOTE: Below code could only be compiled against >= Spark 3.2.1, + // and unfortunately won't compile against Spark 3.2.0 + // However this code is runtime-compatible w/ both Spark 3.2.0 and >= Spark 3.2.1 + val int96RebaseSpec = + DataSourceUtils.int96RebaseSpec(footerFileMetaData.getKeyValueMetaData.get, int96RebaseModeInRead) + val datetimeRebaseSpec = + DataSourceUtils.datetimeRebaseSpec(footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead) + new VectorizedParquetRecordReader( + convertTz.orNull, + datetimeRebaseSpec.mode.toString, + datetimeRebaseSpec.timeZone, + int96RebaseSpec.mode.toString, + int96RebaseSpec.timeZone, + enableOffHeapColumnVector && taskContext.isDefined, + capacity) } else { - None + // Spark 3.2.0 + val datetimeRebaseMode = + Spark32DataSourceUtils.datetimeRebaseMode(footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead) + val int96RebaseMode = + Spark32DataSourceUtils.int96RebaseMode(footerFileMetaData.getKeyValueMetaData.get, int96RebaseModeInRead) + createVectorizedParquetRecordReader( + convertTz.orNull, + datetimeRebaseMode.toString, + int96RebaseMode.toString, + enableOffHeapColumnVector && taskContext.isDefined, + capacity) } - val int96RebaseSpec = DataSourceUtils.int96RebaseSpec( - footerFileMetaData.getKeyValueMetaData.get, int96RebaseModeInread) - val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0) - // use new conf - val hadoopAttempConf = new 
Configuration(broadcastedHadoopConf.value.value) + // SPARK-37089: We cannot register a task completion listener to close this iterator here + // because downstream exec nodes have already registered their listeners. Since listeners + // are executed in reverse order of registration, a listener registered here would close the + // iterator while downstream exec nodes are still running. When off-heap column vectors are + // enabled, this can cause a use-after-free bug leading to a segfault. // - // reset request schema - var typeChangeInfos: java.util.Map[Integer, Pair[DataType, DataType]] = new java.util.HashMap() - if (internalSchemaChangeEnabled) { - val mergedInternalSchema = new InternalSchemaMerger(fileSchema, querySchemaOption.get(), true, true).mergeSchema() - val mergedSchema = SparkInternalSchemaConverter.constructSparkSchemaFromInternalSchema(mergedInternalSchema) - typeChangeInfos = SparkInternalSchemaConverter.collectTypeChangedCols(querySchemaOption.get(), mergedInternalSchema) - hadoopAttempConf.set(ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA, mergedSchema.json) - } - val hadoopAttemptContext = - new TaskAttemptContextImpl(hadoopAttempConf, attemptId) + // Instead, we use FileScanRDD's task completion listener to close this iterator. + val iter = new RecordReaderIterator(vectorizedReader) + try { + vectorizedReader.initialize(split, hadoopAttemptContext) - // Try to push down filters when filter push-down is enabled. - // Notice: This push-down is RowGroups level, not individual records. - if (pushed.isDefined) { - ParquetInputFormat.setFilterPredicate(hadoopAttemptContext.getConfiguration, pushed.get) - } - val taskContext = Option(TaskContext.get()) - if (enableVectorizedReader) { - val vectorizedReader = new Spark32HoodieVectorizedParquetRecordReader( - convertTz.orNull, - datetimeRebaseSpec.mode.toString, - datetimeRebaseSpec.timeZone, - int96RebaseSpec.mode.toString, - int96RebaseSpec.timeZone, - enableOffHeapColumnVector && taskContext.isDefined, - capacity, typeChangeInfos) - val iter = new RecordReaderIterator(vectorizedReader) - // SPARK-23457 Register a task completion listener before `initialization`. - // taskContext.foreach(_.addTaskCompletionListener[Unit](_ => iter.close())) - try { - vectorizedReader.initialize(split, hadoopAttemptContext) + // NOTE: We're making appending of the partitioned values to the rows read from the + // data file configurable + if (shouldAppendPartitionValues) { logDebug(s"Appending $partitionSchema ${file.partitionValues}") vectorizedReader.initBatch(partitionSchema, file.partitionValues) - if (returningBatch) { - vectorizedReader.enableReturningBatches() - } + } else { + vectorizedReader.initBatch(StructType(Nil), InternalRow.empty) + } - // UnsafeRowParquetRecordReader appends the columns internally to avoid another copy. - iter.asInstanceOf[Iterator[InternalRow]] - } catch { - case e: Throwable => - // SPARK-23457: In case there is an exception in initialization, close the iterator to - // avoid leaking resources. - iter.close() - throw e + if (returningBatch) { + vectorizedReader.enableReturningBatches() } - } else { - logDebug(s"Falling back to parquet-mr") + + // UnsafeRowParquetRecordReader appends the columns internally to avoid another copy. + iter.asInstanceOf[Iterator[InternalRow]] + } catch { + case e: Throwable => + // SPARK-23457: In case there is an exception in initialization, close the iterator to + // avoid leaking resources. 
+ iter.close() + throw e + } + } else { + logDebug(s"Falling back to parquet-mr") + val readSupport = if (HoodieSparkUtils.gteqSpark3_2_1) { // ParquetRecordReader returns InternalRow - val readSupport = new ParquetReadSupport( + // NOTE: Below code could only be compiled against >= Spark 3.2.1, + // and unfortunately won't compile against Spark 3.2.0 + // However this code is runtime-compatible w/ both Spark 3.2.0 and >= Spark 3.2.1 + val int96RebaseSpec = + DataSourceUtils.int96RebaseSpec(footerFileMetaData.getKeyValueMetaData.get, int96RebaseModeInRead) + val datetimeRebaseSpec = + DataSourceUtils.datetimeRebaseSpec(footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead) + new ParquetReadSupport( convertTz, enableVectorizedReader = false, datetimeRebaseSpec, int96RebaseSpec) - val reader = if (pushed.isDefined && enableRecordFilter) { - val parquetFilter = FilterCompat.get(pushed.get, null) - new ParquetRecordReader[InternalRow](readSupport, parquetFilter) - } else { - new ParquetRecordReader[InternalRow](readSupport) - } - val iter = new RecordReaderIterator[InternalRow](reader) - // SPARK-23457 Register a task completion listener before `initialization`. - taskContext.foreach(_.addTaskCompletionListener[Unit](_ => iter.close())) + } else { + val datetimeRebaseMode = + Spark32DataSourceUtils.datetimeRebaseMode(footerFileMetaData.getKeyValueMetaData.get, datetimeRebaseModeInRead) + val int96RebaseMode = + Spark32DataSourceUtils.int96RebaseMode(footerFileMetaData.getKeyValueMetaData.get, int96RebaseModeInRead) + createParquetReadSupport( + convertTz, + /* enableVectorizedReader = */ false, + datetimeRebaseMode, + int96RebaseMode) + } + + val reader = if (pushed.isDefined && enableRecordFilter) { + val parquetFilter = FilterCompat.get(pushed.get, null) + new ParquetRecordReader[InternalRow](readSupport, parquetFilter) + } else { + new ParquetRecordReader[InternalRow](readSupport) + } + val iter = new RecordReaderIterator[InternalRow](reader) + try { reader.initialize(split, hadoopAttemptContext) val fullSchema = requiredSchema.toAttributes ++ partitionSchema.toAttributes @@ -274,21 +375,76 @@ class Spark32HoodieParquetFileFormat extends ParquetFileFormat { GenerateUnsafeProjection.generate(castSchema, newFullSchema) } - if (partitionSchema.length == 0) { + // NOTE: We're making appending of the partitioned values to the rows read from the + // data file configurable + if (!shouldAppendPartitionValues || partitionSchema.length == 0) { // There is no partition columns iter.map(unsafeProjection) } else { val joinedRow = new JoinedRow() iter.map(d => unsafeProjection(joinedRow(d, file.partitionValues))) } + } catch { + case e: Throwable => + // SPARK-23457: In case there is an exception in initialization, close the iterator to + // avoid leaking resources. 
+ iter.close() + throw e } } } } + } object Spark32HoodieParquetFileFormat { + /** + * NOTE: This method is specific to Spark 3.2.0 + */ + private def createParquetFilters(args: Any*): ParquetFilters = { + // NOTE: ParquetFilters ctor args contain Scala enum, therefore we can't look it + // up by arg types, and have to instead rely on the number of args based on individual class; + // the ctor order is not guaranteed + val ctor = classOf[ParquetFilters].getConstructors.maxBy(_.getParameterCount) + ctor.newInstance(args.map(_.asInstanceOf[AnyRef]): _*) + .asInstanceOf[ParquetFilters] + } + + /** + * NOTE: This method is specific to Spark 3.2.0 + */ + private def createParquetReadSupport(args: Any*): ParquetReadSupport = { + // NOTE: ParquetReadSupport ctor args contain Scala enum, therefore we can't look it + // up by arg types, and have to instead rely on the number of args based on individual class; + // the ctor order is not guaranteed + val ctor = classOf[ParquetReadSupport].getConstructors.maxBy(_.getParameterCount) + ctor.newInstance(args.map(_.asInstanceOf[AnyRef]): _*) + .asInstanceOf[ParquetReadSupport] + } + + /** + * NOTE: This method is specific to Spark 3.2.0 + */ + private def createVectorizedParquetRecordReader(args: Any*): VectorizedParquetRecordReader = { + // NOTE: ParquetReadSupport ctor args contain Scala enum, therefore we can't look it + // up by arg types, and have to instead rely on the number of args based on individual class; + // the ctor order is not guaranteed + val ctor = classOf[VectorizedParquetRecordReader].getConstructors.maxBy(_.getParameterCount) + ctor.newInstance(args.map(_.asInstanceOf[AnyRef]): _*) + .asInstanceOf[VectorizedParquetRecordReader] + } + + def pruneInternalSchema(internalSchemaStr: String, requiredSchema: StructType): String = { + val querySchemaOption = SerDeHelper.fromJson(internalSchemaStr) + if (querySchemaOption.isPresent && requiredSchema.nonEmpty) { + val prunedSchema = SparkInternalSchemaConverter.convertAndPruneStructTypeToInternalSchema(requiredSchema, querySchemaOption.get()) + SerDeHelper.toJson(prunedSchema) + } else { + internalSchemaStr + } + } + private def rebuildFilterFromParquet(oldFilter: Filter, fileSchema: InternalSchema, querySchema: InternalSchema): Filter = { if (fileSchema == null || querySchema == null) { oldFilter diff --git a/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/hudi/catalog/HoodieCatalog.scala b/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/hudi/catalog/HoodieCatalog.scala index d9858b69cc081..82ea356215ca5 100644 --- a/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/hudi/catalog/HoodieCatalog.scala +++ b/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/hudi/catalog/HoodieCatalog.scala @@ -139,7 +139,7 @@ class HoodieCatalog extends DelegatingCatalogExtension override def renameTable(oldIdent: Identifier, newIdent: Identifier): Unit = { loadTable(oldIdent) match { case _: HoodieInternalV2Table => - new AlterHoodieTableRenameCommand(oldIdent.asTableIdentifier, newIdent.asTableIdentifier, false).run(spark) + AlterHoodieTableRenameCommand(oldIdent.asTableIdentifier, newIdent.asTableIdentifier, false).run(spark) case _ => super.renameTable(oldIdent, newIdent) } } diff --git a/hudi-spark-datasource/hudi-spark3/src/test/resources/log4j-surefire.properties b/hudi-spark-datasource/hudi-spark3/src/test/resources/log4j-surefire.properties index 32af462093ae5..14bbb089724c8 100644 --- 
a/hudi-spark-datasource/hudi-spark3/src/test/resources/log4j-surefire.properties +++ b/hudi-spark-datasource/hudi-spark3/src/test/resources/log4j-surefire.properties @@ -20,9 +20,9 @@ log4j.logger.org.apache=INFO log4j.logger.org.apache.hudi=DEBUG log4j.logger.org.apache.hadoop.hbase=ERROR -# A1 is set to be a ConsoleAppender. +# CONSOLE is set to be a ConsoleAppender. log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. +# CONSOLE uses PatternLayout. log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter diff --git a/hudi-sync/hudi-datahub-sync/src/test/resources/log4j-surefire.properties b/hudi-sync/hudi-datahub-sync/src/test/resources/log4j-surefire.properties index 7914f0a78273b..9782bf7d9c378 100644 --- a/hudi-sync/hudi-datahub-sync/src/test/resources/log4j-surefire.properties +++ b/hudi-sync/hudi-datahub-sync/src/test/resources/log4j-surefire.properties @@ -18,9 +18,9 @@ log4j.rootLogger=WARN, CONSOLE log4j.logger.org.apache.hudi=INFO -# A1 is set to be a ConsoleAppender. +# CONSOLE is set to be a ConsoleAppender. log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. +# CONSOLE uses PatternLayout. log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter diff --git a/hudi-sync/hudi-dla-sync/src/test/resources/log4j-surefire.properties b/hudi-sync/hudi-dla-sync/src/test/resources/log4j-surefire.properties index c03e808cca1f8..fe87e2deeb071 100644 --- a/hudi-sync/hudi-dla-sync/src/test/resources/log4j-surefire.properties +++ b/hudi-sync/hudi-dla-sync/src/test/resources/log4j-surefire.properties @@ -19,9 +19,9 @@ log4j.rootLogger=WARN, CONSOLE log4j.logger.org.apache=INFO log4j.logger.org.apache.hudi=DEBUG -# A1 is set to be a ConsoleAppender. +# CONSOLE is set to be a ConsoleAppender. log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. +# CONSOLE uses PatternLayout. log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter diff --git a/hudi-sync/hudi-hive-sync/src/test/resources/log4j-surefire.properties b/hudi-sync/hudi-hive-sync/src/test/resources/log4j-surefire.properties index c03e808cca1f8..fe87e2deeb071 100644 --- a/hudi-sync/hudi-hive-sync/src/test/resources/log4j-surefire.properties +++ b/hudi-sync/hudi-hive-sync/src/test/resources/log4j-surefire.properties @@ -19,9 +19,9 @@ log4j.rootLogger=WARN, CONSOLE log4j.logger.org.apache=INFO log4j.logger.org.apache.hudi=DEBUG -# A1 is set to be a ConsoleAppender. +# CONSOLE is set to be a ConsoleAppender. log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. +# CONSOLE uses PatternLayout. 
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter diff --git a/hudi-sync/hudi-sync-common/src/test/resources/log4j-surefire.properties b/hudi-sync/hudi-sync-common/src/test/resources/log4j-surefire.properties index c03e808cca1f8..fe87e2deeb071 100644 --- a/hudi-sync/hudi-sync-common/src/test/resources/log4j-surefire.properties +++ b/hudi-sync/hudi-sync-common/src/test/resources/log4j-surefire.properties @@ -19,9 +19,9 @@ log4j.rootLogger=WARN, CONSOLE log4j.logger.org.apache=INFO log4j.logger.org.apache.hudi=DEBUG -# A1 is set to be a ConsoleAppender. +# CONSOLE is set to be a ConsoleAppender. log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. +# CONSOLE uses PatternLayout. log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter diff --git a/hudi-timeline-service/src/test/resources/log4j-surefire.properties b/hudi-timeline-service/src/test/resources/log4j-surefire.properties index c03e808cca1f8..fe87e2deeb071 100644 --- a/hudi-timeline-service/src/test/resources/log4j-surefire.properties +++ b/hudi-timeline-service/src/test/resources/log4j-surefire.properties @@ -19,9 +19,9 @@ log4j.rootLogger=WARN, CONSOLE log4j.logger.org.apache=INFO log4j.logger.org.apache.hudi=DEBUG -# A1 is set to be a ConsoleAppender. +# CONSOLE is set to be a ConsoleAppender. log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. +# CONSOLE uses PatternLayout. log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieCompactor.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieCompactor.java index ce2be7d5038dc..7d21140ae7105 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieCompactor.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieCompactor.java @@ -171,13 +171,15 @@ public static void main(String[] args) { System.exit(1); } final JavaSparkContext jsc = UtilHelpers.buildSparkContext("compactor-" + cfg.tableName, cfg.sparkMaster, cfg.sparkMemory); + int ret = 0; try { HoodieCompactor compactor = new HoodieCompactor(jsc, cfg); - compactor.compact(cfg.retry); + ret = compactor.compact(cfg.retry); } catch (Throwable throwable) { LOG.error("Fail to run compaction for " + cfg.tableName, throwable); } finally { jsc.stop(); + System.exit(ret); } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieIndexer.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieIndexer.java index 2741e2b98a667..96f6ce38cda48 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieIndexer.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieIndexer.java @@ -46,9 +46,14 @@ import java.util.Set; import java.util.stream.Collectors; +import static org.apache.hudi.common.config.HoodieMetadataConfig.ENABLE_METADATA_INDEX_BLOOM_FILTER; +import static org.apache.hudi.common.config.HoodieMetadataConfig.ENABLE_METADATA_INDEX_COLUMN_STATS; import static org.apache.hudi.common.util.StringUtils.isNullOrEmpty; import static 
org.apache.hudi.common.util.ValidationUtils.checkArgument; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.PARTITION_NAME_BLOOM_FILTERS; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.PARTITION_NAME_COLUMN_STATS; import static org.apache.hudi.metadata.HoodieTableMetadataUtil.getCompletedMetadataPartitions; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.getInflightAndCompletedMetadataPartitions; import static org.apache.hudi.utilities.UtilHelpers.EXECUTE; import static org.apache.hudi.utilities.UtilHelpers.SCHEDULE; import static org.apache.hudi.utilities.UtilHelpers.SCHEDULE_AND_EXECUTE; @@ -80,7 +85,7 @@ public class HoodieIndexer { private static final Logger LOG = LogManager.getLogger(HoodieIndexer.class); - private static final String DROP_INDEX = "dropindex"; + static final String DROP_INDEX = "dropindex"; private final HoodieIndexer.Config cfg; private TypedProperties props; @@ -164,6 +169,19 @@ public int start(int retry) { return -1; } + // all inflight or completed metadata partitions have already been initialized + // so enable corresponding indexes in the props so that they're not deleted + Set initializedMetadataPartitions = getInflightAndCompletedMetadataPartitions(metaClient.getTableConfig()); + LOG.info("Setting props for: " + initializedMetadataPartitions); + initializedMetadataPartitions.forEach(p -> { + if (PARTITION_NAME_COLUMN_STATS.equals(p)) { + props.setProperty(ENABLE_METADATA_INDEX_COLUMN_STATS.key(), "true"); + } + if (PARTITION_NAME_BLOOM_FILTERS.equals(p)) { + props.setProperty(ENABLE_METADATA_INDEX_BLOOM_FILTER.key(), "true"); + } + }); + return UtilHelpers.retry(retry, () -> { switch (cfg.runningMode.toLowerCase()) { case SCHEDULE: { diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java index a693bb4c65e47..80cc56a4e579b 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java @@ -68,6 +68,7 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.MessageType; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; @@ -489,7 +490,8 @@ private List validatePartitions(HoodieSparkEngineContext engineContext, // ignore partitions created by uncommitted ingestion. allPartitionPathsFromFS = allPartitionPathsFromFS.stream().parallel().filter(part -> { - HoodiePartitionMetadata hoodiePartitionMetadata = new HoodiePartitionMetadata(metaClient.getFs(), new Path(basePath, part)); + HoodiePartitionMetadata hoodiePartitionMetadata = + new HoodiePartitionMetadata(metaClient.getFs(), FSUtils.getPartitionPath(basePath, part)); Option instantOption = hoodiePartitionMetadata.readPartitionCreatedCommitTime(); if (instantOption.isPresent()) { @@ -808,9 +810,14 @@ private boolean hasCommittedLogFiles( for (String logFilePathStr : logFilePathSet) { HoodieLogFormat.Reader reader = null; try { - Schema readerSchema = - converter.convert(Objects.requireNonNull( - TableSchemaResolver.readSchemaFromLogFile(fs, new Path(logFilePathStr)))); + MessageType messageType = + TableSchemaResolver.readSchemaFromLogFile(fs, new Path(logFilePathStr)); + if (messageType == null) { + LOG.warn(String.format("Cannot read schema from log file %s. 
" + + "Skip the check as it's likely being written by an inflight instant.", logFilePathStr)); + continue; + } + Schema readerSchema = converter.convert(messageType); reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(new Path(logFilePathStr)), readerSchema); // read the avro blocks @@ -829,7 +836,9 @@ private boolean hasCommittedLogFiles( LOG.warn("There is no log block in " + logFilePathStr); } } catch (IOException e) { - throw new HoodieValidationException("Validation failed due to IOException", e); + LOG.warn(String.format("Cannot read log file %s: %s. " + + "Skip the check as it's likely being written by an inflight instant.", + logFilePathStr, e.getMessage()), e); } finally { FileIOUtils.closeQuietly(reader); } @@ -975,7 +984,7 @@ public List> getSortedColumnStatsList( return baseFileNameList.stream().flatMap(filename -> new ParquetUtils().readRangeFromParquetMetadata( metaClient.getHadoopConf(), - new Path(new Path(metaClient.getBasePath(), partitionPath), filename), + new Path(FSUtils.getPartitionPath(metaClient.getBasePath(), partitionPath), filename), allColumnNameList).stream()) .sorted(new HoodieColumnRangeMetadataComparator()) .collect(Collectors.toList()); @@ -1016,7 +1025,7 @@ private List getAllColumnNames() { } private Option readBloomFilterFromFile(String partitionPath, String filename) { - Path path = new Path(new Path(metaClient.getBasePath(), partitionPath), filename); + Path path = new Path(FSUtils.getPartitionPath(metaClient.getBasePath(), partitionPath), filename); HoodieFileReader fileReader; try { fileReader = HoodieFileReaderFactory.getFileReader(metaClient.getHadoopConf(), path); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java index 5d1fd19267911..f389695f7bb4d 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java @@ -279,6 +279,7 @@ private static SparkConf buildSparkConf(String appName, String defaultMaster, Ma sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true"); sparkConf.set("spark.hadoop.mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"); sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK"); + sparkConf.set("spark.driver.allowMultipleContexts", "true"); additionalConfigs.forEach(sparkConf::set); return SparkRDDWriteClient.registerClasses(sparkConf); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java index 0e57bd379acdb..b086a6c9edbab 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java @@ -105,6 +105,7 @@ import scala.collection.JavaConversions; import static org.apache.hudi.common.table.HoodieTableConfig.ARCHIVELOG_FOLDER; +import static org.apache.hudi.common.table.HoodieTableConfig.DROP_PARTITION_COLUMNS; import static org.apache.hudi.config.HoodieClusteringConfig.ASYNC_CLUSTERING_ENABLE; import static org.apache.hudi.config.HoodieClusteringConfig.INLINE_CLUSTERING; import static org.apache.hudi.config.HoodieCompactionConfig.INLINE_COMPACT; @@ -280,6 +281,7 @@ public void refreshTimeline() throws IOException { .setPreCombineField(cfg.sourceOrderingField) 
.setPartitionMetafileUseBaseFormat(props.getBoolean(HoodieTableConfig.PARTITION_METAFILE_USE_BASE_FORMAT.key(), HoodieTableConfig.PARTITION_METAFILE_USE_BASE_FORMAT.defaultValue())) + .setShouldDropPartitionColumns(isDropPartitionColumns()) .initTable(new Configuration(jssc.hadoopConfiguration()), cfg.targetBasePath); } @@ -375,6 +377,7 @@ public Pair>> readFromSource( SimpleKeyGenerator.class.getName())) .setPartitionMetafileUseBaseFormat(props.getBoolean(HoodieTableConfig.PARTITION_METAFILE_USE_BASE_FORMAT.key(), HoodieTableConfig.PARTITION_METAFILE_USE_BASE_FORMAT.defaultValue())) + .setShouldDropPartitionColumns(isDropPartitionColumns()) .initTable(new Configuration(jssc.hadoopConfiguration()), cfg.targetBasePath); } @@ -477,14 +480,16 @@ private Pair>> fetchFromSourc } boolean shouldCombine = cfg.filterDupes || cfg.operation.equals(WriteOperationType.UPSERT); + List partitionColumns = getPartitionColumns(keyGenerator, props); JavaRDD avroRDD = avroRDDOptional.get(); - JavaRDD records = avroRDD.map(gr -> { + JavaRDD records = avroRDD.map(record -> { + GenericRecord gr = isDropPartitionColumns() ? HoodieAvroUtils.removeFields(record, partitionColumns) : record; HoodieRecordPayload payload = shouldCombine ? DataSourceUtils.createPayload(cfg.payloadClassName, gr, (Comparable) HoodieAvroUtils.getNestedFieldVal(gr, cfg.sourceOrderingField, false, props.getBoolean( KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(), Boolean.parseBoolean(KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.defaultValue())))) : DataSourceUtils.createPayload(cfg.payloadClassName, gr); - return new HoodieAvroRecord<>(keyGenerator.getKey(gr), payload); + return new HoodieAvroRecord<>(keyGenerator.getKey(record), payload); }); return Pair.of(schemaProvider, Pair.of(checkpointStr, records)); @@ -727,6 +732,9 @@ public void setupWriteClient() throws IOException { private void reInitWriteClient(Schema sourceSchema, Schema targetSchema) throws IOException { LOG.info("Setting up new Hoodie Write Client"); + if (isDropPartitionColumns()) { + targetSchema = HoodieAvroUtils.removeFields(targetSchema, getPartitionColumns(keyGenerator, props)); + } registerAvroSchemas(sourceSchema, targetSchema); HoodieWriteConfig hoodieCfg = getHoodieClientConfig(targetSchema); if (hoodieCfg.isEmbeddedTimelineServerEnabled()) { @@ -898,4 +906,24 @@ public Option getClusteringInstantOpt() { return Option.empty(); } } + + /** + * Set based on hoodie.datasource.write.drop.partition.columns config. + * When set to true, will not write the partition columns into the table. + */ + private Boolean isDropPartitionColumns() { + return props.getBoolean(DROP_PARTITION_COLUMNS.key(), DROP_PARTITION_COLUMNS.defaultValue()); + } + + /** + * Get the list of partition columns as a list of strings. + * + * @param keyGenerator KeyGenerator + * @param props TypedProperties + * @return List of partition columns. 
+ */ + private List getPartitionColumns(KeyGenerator keyGenerator, TypedProperties props) { + String partitionColumns = HoodieSparkUtils.getPartitionColumns(keyGenerator, props); + return Arrays.asList(partitionColumns.split(",")); + } } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieIndexer.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieIndexer.java index 9ce8eef313de3..9312a26b4f950 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieIndexer.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieIndexer.java @@ -23,14 +23,24 @@ import org.apache.hudi.avro.model.HoodieIndexPartitionInfo; import org.apache.hudi.client.HoodieReadClient; import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.metadata.MetadataPartitionType; import org.apache.hudi.testutils.providers.SparkProvider; +import org.apache.hadoop.fs.Path; import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SQLContext; import org.apache.spark.sql.SparkSession; @@ -40,7 +50,20 @@ import java.io.IOException; import java.util.Arrays; import java.util.List; +import java.util.Objects; +import static org.apache.hudi.common.table.HoodieTableMetaClient.reload; +import static org.apache.hudi.common.table.timeline.HoodieInstant.State.REQUESTED; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.getCompletedMetadataPartitions; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.metadataPartitionExists; +import static org.apache.hudi.metadata.MetadataPartitionType.BLOOM_FILTERS; +import static org.apache.hudi.metadata.MetadataPartitionType.COLUMN_STATS; +import static org.apache.hudi.metadata.MetadataPartitionType.FILES; +import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors; +import static org.apache.hudi.utilities.HoodieIndexer.DROP_INDEX; +import static org.apache.hudi.utilities.UtilHelpers.SCHEDULE; +import static org.apache.hudi.utilities.UtilHelpers.SCHEDULE_AND_EXECUTE; +import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -64,7 +87,15 @@ public void init() throws IOException { context = new HoodieSparkEngineContext(jsc); } initPath(); - metaClient = HoodieTestUtils.init(basePath, getTableType()); + initMetaClient(); + } + + protected void initMetaClient() throws IOException { + String rootPathStr = "file://" + tempDir.toAbsolutePath().toString(); + Path rootPath = new Path(rootPathStr); + rootPath.getFileSystem(jsc.hadoopConfiguration()).mkdirs(rootPath); + metaClient = HoodieTestUtils.init(rootPathStr, getTableType()); + basePath = metaClient.getBasePath(); } 
@Test @@ -75,9 +106,9 @@ public void testGetRequestedPartitionTypes() { config.indexTypes = "FILES,BLOOM_FILTERS,COLUMN_STATS"; HoodieIndexer indexer = new HoodieIndexer(jsc, config); List partitionTypes = indexer.getRequestedPartitionTypes(config.indexTypes); - assertFalse(partitionTypes.contains(MetadataPartitionType.FILES)); - assertTrue(partitionTypes.contains(MetadataPartitionType.BLOOM_FILTERS)); - assertTrue(partitionTypes.contains(MetadataPartitionType.COLUMN_STATS)); + assertFalse(partitionTypes.contains(FILES)); + assertTrue(partitionTypes.contains(BLOOM_FILTERS)); + assertTrue(partitionTypes.contains(COLUMN_STATS)); } @Test @@ -90,7 +121,7 @@ public void testIsIndexBuiltForAllRequestedTypes() { HoodieIndexCommitMetadata commitMetadata = HoodieIndexCommitMetadata.newBuilder() .setIndexPartitionInfos(Arrays.asList(new HoodieIndexPartitionInfo( 1, - MetadataPartitionType.COLUMN_STATS.getPartitionPath(), + COLUMN_STATS.getPartitionPath(), "0000"))) .build(); assertFalse(indexer.isIndexBuiltForAllRequestedTypes(commitMetadata.getIndexPartitionInfos())); @@ -100,6 +131,204 @@ public void testIsIndexBuiltForAllRequestedTypes() { assertTrue(indexer.isIndexBuiltForAllRequestedTypes(commitMetadata.getIndexPartitionInfos())); } + @Test + public void testIndexerWithNotAllIndexesEnabled() { + initTestDataGenerator(); + String tableName = "indexer_test"; + HoodieWriteConfig.Builder writeConfigBuilder = getWriteConfigBuilder(basePath, tableName); + // enable files and bloom_filters on the regular write client + HoodieMetadataConfig.Builder metadataConfigBuilder = getMetadataConfigBuilder(true, false).withMetadataIndexBloomFilter(true); + HoodieWriteConfig writeConfig = writeConfigBuilder.withMetadataConfig(metadataConfigBuilder.build()).build(); + // do one upsert with synchronous metadata update + SparkRDDWriteClient writeClient = new SparkRDDWriteClient(context, writeConfig); + String instant = "0001"; + writeClient.startCommitWithTime(instant); + List records = dataGen.generateInserts(instant, 100); + JavaRDD result = writeClient.upsert(jsc.parallelize(records, 1), instant); + List statuses = result.collect(); + assertNoWriteErrors(statuses); + + // validate table config + assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(FILES.getPartitionPath())); + assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(BLOOM_FILTERS.getPartitionPath())); + + // build indexer config which has only column_stats enabled (files is enabled by default) + HoodieIndexer.Config config = new HoodieIndexer.Config(); + String propsPath = Objects.requireNonNull(getClass().getClassLoader().getResource("delta-streamer-config/indexer.properties")).getPath(); + config.basePath = basePath; + config.tableName = tableName; + config.indexTypes = COLUMN_STATS.name(); + config.runningMode = SCHEDULE_AND_EXECUTE; + config.propsFilePath = propsPath; + // start the indexer and validate column_stats index is also complete + HoodieIndexer indexer = new HoodieIndexer(jsc, config); + assertEquals(0, indexer.start(0)); + + // validate table config + assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(FILES.getPartitionPath())); + assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(BLOOM_FILTERS.getPartitionPath())); + assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(COLUMN_STATS.getPartitionPath())); + // validate metadata partitions actually exist + 
assertTrue(metadataPartitionExists(basePath, context, FILES)); + assertTrue(metadataPartitionExists(basePath, context, COLUMN_STATS)); + assertTrue(metadataPartitionExists(basePath, context, BLOOM_FILTERS)); + } + + @Test + public void testIndexerDropPartitionDeletesInstantFromTimeline() { + initTestDataGenerator(); + String tableName = "indexer_test"; + HoodieWriteConfig.Builder writeConfigBuilder = getWriteConfigBuilder(basePath, tableName); + // enable files on the regular write client + HoodieMetadataConfig.Builder metadataConfigBuilder = getMetadataConfigBuilder(true, false).withMetadataIndexBloomFilter(true); + HoodieWriteConfig writeConfig = writeConfigBuilder.withMetadataConfig(metadataConfigBuilder.build()).build(); + // do one upsert with synchronous metadata update + SparkRDDWriteClient writeClient = new SparkRDDWriteClient(context, writeConfig); + String instant = "0001"; + writeClient.startCommitWithTime(instant); + List records = dataGen.generateInserts(instant, 100); + JavaRDD result = writeClient.upsert(jsc.parallelize(records, 1), instant); + List statuses = result.collect(); + assertNoWriteErrors(statuses); + + // validate partitions built successfully + assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(FILES.getPartitionPath())); + assertTrue(metadataPartitionExists(basePath, context, FILES)); + assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(BLOOM_FILTERS.getPartitionPath())); + assertTrue(metadataPartitionExists(basePath, context, BLOOM_FILTERS)); + + // build indexer config which has only column_stats enabled (files is enabled by default) + HoodieIndexer.Config config = new HoodieIndexer.Config(); + String propsPath = Objects.requireNonNull(getClass().getClassLoader().getResource("delta-streamer-config/indexer.properties")).getPath(); + config.basePath = basePath; + config.tableName = tableName; + config.indexTypes = COLUMN_STATS.name(); + config.runningMode = SCHEDULE; + config.propsFilePath = propsPath; + + // schedule indexing and validate column_stats index is also initialized + HoodieIndexer indexer = new HoodieIndexer(jsc, config); + assertEquals(0, indexer.start(0)); + Option indexInstantInTimeline = metaClient.reloadActiveTimeline().filterPendingIndexTimeline().lastInstant(); + assertTrue(indexInstantInTimeline.isPresent()); + assertEquals(REQUESTED, indexInstantInTimeline.get().getState()); + assertTrue(metadataPartitionExists(basePath, context, COLUMN_STATS)); + + // drop column_stats and validate indexing.requested is also removed from the timeline + config.runningMode = DROP_INDEX; + indexer = new HoodieIndexer(jsc, config); + assertEquals(0, indexer.start(0)); + indexInstantInTimeline = metaClient.reloadActiveTimeline().filterPendingIndexTimeline().lastInstant(); + assertFalse(indexInstantInTimeline.isPresent()); + assertFalse(metadataPartitionExists(basePath, context, COLUMN_STATS)); + + // check other partitions are intact + assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(FILES.getPartitionPath())); + assertTrue(metadataPartitionExists(basePath, context, FILES)); + assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(BLOOM_FILTERS.getPartitionPath())); + assertTrue(metadataPartitionExists(basePath, context, BLOOM_FILTERS)); + } + + @Test + public void testTwoIndexersOneCreateOneDropPartition() { + initTestDataGenerator(); + String tableName = "indexer_test"; + HoodieWriteConfig.Builder 
writeConfigBuilder = getWriteConfigBuilder(basePath, tableName); + // enable files on the regular write client + HoodieMetadataConfig.Builder metadataConfigBuilder = getMetadataConfigBuilder(true, false); + HoodieWriteConfig writeConfig = writeConfigBuilder.withMetadataConfig(metadataConfigBuilder.build()).build(); + // do one upsert with synchronous metadata update + SparkRDDWriteClient writeClient = new SparkRDDWriteClient(context, writeConfig); + String instant = "0001"; + writeClient.startCommitWithTime(instant); + List records = dataGen.generateInserts(instant, 100); + JavaRDD result = writeClient.upsert(jsc.parallelize(records, 1), instant); + List statuses = result.collect(); + assertNoWriteErrors(statuses); + + // validate files partition built successfully + assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(FILES.getPartitionPath())); + assertTrue(metadataPartitionExists(basePath, context, FILES)); + + // build indexer config which has only bloom_filters enabled + HoodieIndexer.Config config = getHoodieIndexConfig(BLOOM_FILTERS.name(), SCHEDULE_AND_EXECUTE, "delta-streamer-config/indexer-only-bloom.properties"); + // start the indexer and validate bloom_filters index is also complete + HoodieIndexer indexer = new HoodieIndexer(jsc, config); + assertEquals(0, indexer.start(0)); + assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(BLOOM_FILTERS.getPartitionPath())); + assertTrue(metadataPartitionExists(basePath, context, BLOOM_FILTERS)); + + // completed index timeline for later validation + Option bloomIndexInstant = metaClient.reloadActiveTimeline().filterCompletedIndexTimeline().lastInstant(); + assertTrue(bloomIndexInstant.isPresent()); + + // build indexer config which has only column_stats enabled + config = getHoodieIndexConfig(COLUMN_STATS.name(), SCHEDULE, "delta-streamer-config/indexer.properties"); + + // schedule indexing and validate column_stats index is also initialized + // and indexing.requested instant is present + indexer = new HoodieIndexer(jsc, config); + assertEquals(0, indexer.start(0)); + Option columnStatsIndexInstant = metaClient.reloadActiveTimeline().filterPendingIndexTimeline().lastInstant(); + assertTrue(columnStatsIndexInstant.isPresent()); + assertEquals(REQUESTED, columnStatsIndexInstant.get().getState()); + assertTrue(metadataPartitionExists(basePath, context, COLUMN_STATS)); + + // drop column_stats and validate indexing.requested is also removed from the timeline + // and completed indexing instant corresponding to bloom_filters index is still present + dropIndexAndAssert(COLUMN_STATS, "delta-streamer-config/indexer.properties", Option.empty()); + + // check other partitions are intact + assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(FILES.getPartitionPath())); + assertTrue(metadataPartitionExists(basePath, context, FILES)); + assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(BLOOM_FILTERS.getPartitionPath())); + assertTrue(metadataPartitionExists(basePath, context, BLOOM_FILTERS)); + + // drop bloom filter partition. timeline files should not be deleted since the index building is complete. 
+ dropIndexAndAssert(BLOOM_FILTERS, "delta-streamer-config/indexer-only-bloom.properties", bloomIndexInstant); + } + + private void dropIndexAndAssert(MetadataPartitionType indexType, String resourceFilePath, Option completedIndexInstant) { + HoodieIndexer.Config config = getHoodieIndexConfig(indexType.name(), DROP_INDEX, resourceFilePath); + HoodieIndexer indexer = new HoodieIndexer(jsc, config); + assertEquals(0, indexer.start(0)); + Option pendingFlights = metaClient.reloadActiveTimeline().filterPendingIndexTimeline().lastInstant(); + assertFalse(pendingFlights.isPresent()); + assertFalse(metadataPartitionExists(basePath, context, indexType)); + if (completedIndexInstant.isPresent()) { + assertEquals(completedIndexInstant, metaClient.reloadActiveTimeline().filterCompletedIndexTimeline().lastInstant()); + } + } + + private HoodieIndexer.Config getHoodieIndexConfig(String indexType, String runMode, String resourceFilePath) { + HoodieIndexer.Config config = new HoodieIndexer.Config(); + String propsPath = Objects.requireNonNull(getClass().getClassLoader().getResource(resourceFilePath)).getPath(); + config.basePath = basePath; + config.tableName = tableName; + config.indexTypes = indexType; + config.runningMode = runMode; + config.propsFilePath = propsPath; + return config; + } + + private static HoodieWriteConfig.Builder getWriteConfigBuilder(String basePath, String tableName) { + return HoodieWriteConfig.newBuilder() + .withPath(basePath) + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) + .withParallelism(2, 2) + .withBulkInsertParallelism(2) + .withFinalizeWriteParallelism(2) + .withDeleteParallelism(2) + .withTimelineLayoutVersion(TimelineLayoutVersion.CURR_VERSION) + .forTable(tableName); + } + + private static HoodieMetadataConfig.Builder getMetadataConfigBuilder(boolean enable, boolean asyncIndex) { + return HoodieMetadataConfig.newBuilder() + .enable(enable) + .withAsyncIndex(asyncIndex); + } + @Override public HoodieEngineContext context() { return context; diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieDeltaStreamer.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieDeltaStreamer.java index 2db72cbd4102e..0576f6aaee88b 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieDeltaStreamer.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieDeltaStreamer.java @@ -104,6 +104,7 @@ import org.apache.spark.sql.types.StructField; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Tag; @@ -123,6 +124,7 @@ import java.util.List; import java.util.Map; import java.util.Properties; +import java.util.concurrent.CountDownLatch; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; @@ -380,7 +382,7 @@ static void waitTillCondition(Function condition, Future dsFut ret = false; } } - return true; + return ret; }); res.get(timeoutInSecs, TimeUnit.SECONDS); } @@ -1028,17 +1030,20 @@ public void testHoodieIndexer() throws Exception { }); } - @Disabled("HUDI-3710 to fix the ConcurrentModificationException") @ParameterizedTest @ValueSource(booleans = {true, false}) public void testHoodieAsyncClusteringJob(boolean shouldPassInClusteringInstantTime) throws Exception { String tableBasePath = dfsBasePath + 
"/asyncClusteringJob"; - - HoodieDeltaStreamer ds = initialHoodieDeltaStreamer(tableBasePath, 3000, "true"); + HoodieDeltaStreamer ds = initialHoodieDeltaStreamer(tableBasePath, 3000, "false"); + CountDownLatch countDownLatch = new CountDownLatch(1); deltaStreamerTestRunner(ds, (r) -> { TestHelpers.assertAtLeastNCommits(2, tableBasePath, dfs); + countDownLatch.countDown(); + return true; + }); + if (countDownLatch.await(2, TimeUnit.MINUTES)) { Option scheduleClusteringInstantTime = Option.empty(); try { HoodieClusteringJob scheduleClusteringJob = @@ -1046,7 +1051,7 @@ public void testHoodieAsyncClusteringJob(boolean shouldPassInClusteringInstantTi scheduleClusteringInstantTime = scheduleClusteringJob.doSchedule(); } catch (Exception e) { LOG.warn("Schedule clustering failed", e); - return false; + Assertions.fail("Schedule clustering failed", e); } if (scheduleClusteringInstantTime.isPresent()) { LOG.info("Schedule clustering success, now cluster with instant time " + scheduleClusteringInstantTime.get()); @@ -1054,13 +1059,15 @@ public void testHoodieAsyncClusteringJob(boolean shouldPassInClusteringInstantTi shouldPassInClusteringInstantTime ? scheduleClusteringInstantTime.get() : null, false); HoodieClusteringJob clusterClusteringJob = new HoodieClusteringJob(jsc, clusterClusteringConfig); clusterClusteringJob.cluster(clusterClusteringConfig.retry); + TestHelpers.assertAtLeastNReplaceCommits(1, tableBasePath, dfs); LOG.info("Cluster success"); } else { - LOG.warn("Schedule clustering failed"); + LOG.warn("Clustering execution failed"); + Assertions.fail("Clustering execution failed"); } - TestHelpers.assertAtLeastNReplaceCommits(2, tableBasePath, dfs); - return true; - }); + } else { + Assertions.fail("Deltastreamer should have completed 2 commits."); + } } @Test @@ -1683,7 +1690,6 @@ public void testParquetDFSSourceForEmptyBatch() throws Exception { testParquetDFSSource(false, null, true); } - @Disabled("HUDI-3707 To investigate problem with schema provider and transformer") @Test public void testParquetDFSSourceWithoutSchemaProviderAndTransformer() throws Exception { testParquetDFSSource(false, Collections.singletonList(TripsWithDistanceTransformer.class.getName())); @@ -1694,7 +1700,6 @@ public void testParquetDFSSourceWithSourceSchemaFileAndNoTransformer() throws Ex testParquetDFSSource(true, null); } - @Disabled("HUDI-3707 To investigate problem with schema provider and transformer") @Test public void testParquetDFSSourceWithSchemaFilesAndTransformer() throws Exception { testParquetDFSSource(true, Collections.singletonList(TripsWithDistanceTransformer.class.getName())); @@ -1705,7 +1710,6 @@ public void testORCDFSSourceWithoutSchemaProviderAndNoTransformer() throws Excep testORCDFSSource(false, null); } - @Disabled("HUDI-3707 To investigate problem with schema provider and transformer") @Test public void testORCDFSSourceWithSchemaProviderAndWithTransformer() throws Exception { testORCDFSSource(true, Collections.singletonList(TripsWithDistanceTransformer.class.getName())); @@ -1800,7 +1804,6 @@ public void testCsvDFSSourceWithHeaderAndSepWithoutSchemaProviderAndWithTransfor testCsvDFSSource(true, '\t', false, Collections.singletonList(TripsWithDistanceTransformer.class.getName())); } - @Disabled("HUDI-3707 To investigate problem with schema provider and transformer") @Test public void testCsvDFSSourceWithHeaderAndSepWithSchemaProviderAndTransformer() throws Exception { // The CSV files have header, the columns are separated by '\t' @@ -1843,7 +1846,6 @@ public void 
testCsvDFSSourceNoHeaderWithoutSchemaProviderAndWithTransformer() th assertTrue(e.getMessage().contains("cannot resolve '`begin_lat`' given input columns:")); } - @Disabled("HUDI-3707 To investigate problem with schema provider and transformer") @Test public void testCsvDFSSourceNoHeaderWithSchemaProviderAndTransformer() throws Exception { // The CSV files do not have header, the columns are separated by '\t' diff --git a/hudi-utilities/src/test/resources/delta-streamer-config/indexer-only-bloom.properties b/hudi-utilities/src/test/resources/delta-streamer-config/indexer-only-bloom.properties new file mode 100644 index 0000000000000..6035077437068 --- /dev/null +++ b/hudi-utilities/src/test/resources/delta-streamer-config/indexer-only-bloom.properties @@ -0,0 +1,25 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +hoodie.metadata.enable=true +hoodie.metadata.index.async=true +hoodie.metadata.index.bloom.filter.enable=true +hoodie.metadata.index.check.timeout.seconds=60 +hoodie.write.concurrency.mode=optimistic_concurrency_control +hoodie.write.lock.provider=org.apache.hudi.client.transaction.lock.InProcessLockProvider \ No newline at end of file diff --git a/hudi-utilities/src/test/resources/delta-streamer-config/schema_registry.source_schema_tab.sql b/hudi-utilities/src/test/resources/delta-streamer-config/schema_registry.source_schema_tab.sql index b95ae0f5ee151..a5714041cf850 100644 --- a/hudi-utilities/src/test/resources/delta-streamer-config/schema_registry.source_schema_tab.sql +++ b/hudi-utilities/src/test/resources/delta-streamer-config/schema_registry.source_schema_tab.sql @@ -1,3 +1,19 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"), you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
+ CREATE TABLE IF NOT EXISTS `schema_registry`.`source_schema_tab`( `id` BIGINT, `name` STRING, @@ -9,4 +25,4 @@ CREATE TABLE IF NOT EXISTS `schema_registry`.`source_schema_tab`( `num6` DOUBLE, `bool` BOOLEAN, `bin` BINARY -) \ No newline at end of file +) diff --git a/hudi-utilities/src/test/resources/delta-streamer-config/schema_registry.target_schema_tab.sql b/hudi-utilities/src/test/resources/delta-streamer-config/schema_registry.target_schema_tab.sql index 07f179f90ed93..3adfa21bb5630 100644 --- a/hudi-utilities/src/test/resources/delta-streamer-config/schema_registry.target_schema_tab.sql +++ b/hudi-utilities/src/test/resources/delta-streamer-config/schema_registry.target_schema_tab.sql @@ -1,3 +1,19 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"), you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + CREATE TABLE IF NOT EXISTS `schema_registry`.`target_schema_tab`( `id` BIGINT, `name` STRING, @@ -9,4 +25,4 @@ CREATE TABLE IF NOT EXISTS `schema_registry`.`target_schema_tab`( `num6` DOUBLE, `bool` BOOLEAN, `bin` BINARY -) \ No newline at end of file +) diff --git a/hudi-utilities/src/test/resources/log4j-surefire.properties b/hudi-utilities/src/test/resources/log4j-surefire.properties index c5bdf75ae2ae3..31841a6a3a1ec 100644 --- a/hudi-utilities/src/test/resources/log4j-surefire.properties +++ b/hudi-utilities/src/test/resources/log4j-surefire.properties @@ -20,9 +20,9 @@ log4j.logger.org.apache=INFO log4j.logger.org.apache.hudi=DEBUG log4j.logger.org.apache.hadoop.hbase=ERROR -# A1 is set to be a ConsoleAppender. +# CONSOLE is set to be a ConsoleAppender. log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender -# A1 uses PatternLayout. +# CONSOLE uses PatternLayout. 
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter diff --git a/packaging/hudi-datahub-sync-bundle/pom.xml b/packaging/hudi-datahub-sync-bundle/pom.xml index d45f07f31c23f..1bdff502d3f9a 100644 --- a/packaging/hudi-datahub-sync-bundle/pom.xml +++ b/packaging/hudi-datahub-sync-bundle/pom.xml @@ -63,6 +63,7 @@ META-INF/LICENSE target/classes/META-INF/LICENSE + @@ -70,9 +71,23 @@ org.apache.hudi:hudi-hadoop-mr org.apache.hudi:hudi-sync-common org.apache.hudi:hudi-datahub-sync + org.apache.parquet:parquet-avro io.acryl:datahub-client com.beust:jcommander + commons-io:commons-io + org.apache.hbase:hbase-common + org.apache.hbase:hbase-client + org.apache.hbase:hbase-hadoop-compat + org.apache.hbase:hbase-hadoop2-compat + org.apache.hbase:hbase-metrics + org.apache.hbase:hbase-metrics-api + org.apache.hbase:hbase-protocol-shaded + org.apache.hbase:hbase-server + org.apache.hbase.thirdparty:hbase-shaded-miscellaneous + org.apache.hbase.thirdparty:hbase-shaded-netty + org.apache.hbase.thirdparty:hbase-shaded-protobuf + org.apache.htrace:htrace-core4 org.apache.httpcomponents:fluent-hc org.apache.httpcomponents:httpcore org.apache.httpcomponents:httpclient @@ -80,6 +95,107 @@ org.apache.httpcomponents:httpcore-nio + + + com.esotericsoftware.kryo. + org.apache.hudi.com.esotericsoftware.kryo. + + + com.esotericsoftware.minlog. + org.apache.hudi.com.esotericsoftware.minlog. + + + org.apache.commons.io. + org.apache.hudi.org.apache.commons.io. + + + org.apache.hadoop.hbase. + org.apache.hudi.org.apache.hadoop.hbase. + + org.apache.hadoop.hbase.KeyValue$KeyComparator + + + + org.apache.hbase. + org.apache.hudi.org.apache.hbase. + + + org.apache.htrace. + org.apache.hudi.org.apache.htrace. + + + org.objenesis. + org.apache.hudi.org.objenesis. 
+ + + + org.apache.hadoop.metrics2.MetricHistogram + org.apache.hudi.org.apache.hadoop.metrics2.MetricHistogram + + + + org.apache.hadoop.metrics2.MetricsExecutor + org.apache.hudi.org.apache.hadoop.metrics2.MetricsExecutor + + + + org.apache.hadoop.metrics2.impl.JmxCacheBuster + org.apache.hudi.org.apache.hadoop.metrics2.impl.JmxCacheBuster + + + org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper + org.apache.hudi.org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper + + + + org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry + org.apache.hudi.org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry + + + + org.apache.hadoop.metrics2.lib.MetricsExecutorImpl + org.apache.hudi.org.apache.hadoop.metrics2.lib.MetricsExecutorImpl + + + + org.apache.hadoop.metrics2.lib.MutableFastCounter + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableFastCounter + + + + org.apache.hadoop.metrics2.lib.MutableHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableHistogram + + + + org.apache.hadoop.metrics2.lib.MutableRangeHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableRangeHistogram + + + + org.apache.hadoop.metrics2.lib.MutableSizeHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableSizeHistogram + + + + org.apache.hadoop.metrics2.lib.MutableTimeHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableTimeHistogram + + + + org.apache.hadoop.metrics2.util.MetricQuantile + org.apache.hudi.org.apache.hadoop.metrics2.util.MetricQuantile + + + + org.apache.hadoop.metrics2.util.MetricSampleQuantiles + org.apache.hudi.org.apache.hadoop.metrics2.util.MetricSampleQuantiles + + + false @@ -89,6 +205,8 @@ META-INF/*.DSA META-INF/*.RSA META-INF/services/javax.* + **/*.proto + hbase-webapps/** diff --git a/packaging/hudi-flink-bundle/pom.xml b/packaging/hudi-flink-bundle/pom.xml index 584c3871cd449..a322daaabe9a1 100644 --- a/packaging/hudi-flink-bundle/pom.xml +++ b/packaging/hudi-flink-bundle/pom.xml @@ -153,8 +153,8 @@ org.apache.hbase:hbase-hadoop2-compat org.apache.hbase:hbase-metrics org.apache.hbase:hbase-metrics-api - org.apache.hbase:hbase-server org.apache.hbase:hbase-protocol-shaded + org.apache.hbase:hbase-server org.apache.hbase.thirdparty:hbase-shaded-miscellaneous org.apache.hbase.thirdparty:hbase-shaded-netty org.apache.hbase.thirdparty:hbase-shaded-protobuf diff --git a/packaging/hudi-gcp-bundle/pom.xml b/packaging/hudi-gcp-bundle/pom.xml index 7121076f271b4..97bdf56940814 100644 --- a/packaging/hudi-gcp-bundle/pom.xml +++ b/packaging/hudi-gcp-bundle/pom.xml @@ -87,6 +87,7 @@ META-INF/LICENSE target/classes/META-INF/LICENSE + @@ -94,11 +95,126 @@ org.apache.hudi:hudi-hadoop-mr org.apache.hudi:hudi-sync-common org.apache.hudi:hudi-gcp + org.apache.parquet:parquet-avro com.google.cloud:google-cloud-bigquery com.beust:jcommander + commons-io:commons-io + org.apache.hbase:hbase-common + org.apache.hbase:hbase-client + org.apache.hbase:hbase-hadoop-compat + org.apache.hbase:hbase-hadoop2-compat + org.apache.hbase:hbase-metrics + org.apache.hbase:hbase-metrics-api + org.apache.hbase:hbase-protocol-shaded + org.apache.hbase:hbase-server + org.apache.hbase.thirdparty:hbase-shaded-miscellaneous + org.apache.hbase.thirdparty:hbase-shaded-netty + org.apache.hbase.thirdparty:hbase-shaded-protobuf + org.apache.htrace:htrace-core4 + + + com.esotericsoftware.kryo. + org.apache.hudi.com.esotericsoftware.kryo. + + + com.esotericsoftware.minlog. + org.apache.hudi.com.esotericsoftware.minlog. + + + org.apache.commons.io. 
+ org.apache.hudi.org.apache.commons.io. + + + org.apache.hadoop.hbase. + org.apache.hudi.org.apache.hadoop.hbase. + + org.apache.hadoop.hbase.KeyValue$KeyComparator + + + + org.apache.hbase. + org.apache.hudi.org.apache.hbase. + + + org.apache.htrace. + org.apache.hudi.org.apache.htrace. + + + org.objenesis. + org.apache.hudi.org.objenesis. + + + + org.apache.hadoop.metrics2.MetricHistogram + org.apache.hudi.org.apache.hadoop.metrics2.MetricHistogram + + + + org.apache.hadoop.metrics2.MetricsExecutor + org.apache.hudi.org.apache.hadoop.metrics2.MetricsExecutor + + + + org.apache.hadoop.metrics2.impl.JmxCacheBuster + org.apache.hudi.org.apache.hadoop.metrics2.impl.JmxCacheBuster + + + org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper + org.apache.hudi.org.apache.hadoop.metrics2.lib.DefaultMetricsSystemHelper + + + + org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry + org.apache.hudi.org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry + + + + org.apache.hadoop.metrics2.lib.MetricsExecutorImpl + org.apache.hudi.org.apache.hadoop.metrics2.lib.MetricsExecutorImpl + + + + org.apache.hadoop.metrics2.lib.MutableFastCounter + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableFastCounter + + + + org.apache.hadoop.metrics2.lib.MutableHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableHistogram + + + + org.apache.hadoop.metrics2.lib.MutableRangeHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableRangeHistogram + + + + org.apache.hadoop.metrics2.lib.MutableSizeHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableSizeHistogram + + + + org.apache.hadoop.metrics2.lib.MutableTimeHistogram + org.apache.hudi.org.apache.hadoop.metrics2.lib.MutableTimeHistogram + + + + org.apache.hadoop.metrics2.util.MetricQuantile + org.apache.hudi.org.apache.hadoop.metrics2.util.MetricQuantile + + + + org.apache.hadoop.metrics2.util.MetricSampleQuantiles + org.apache.hudi.org.apache.hadoop.metrics2.util.MetricSampleQuantiles + + + false @@ -108,6 +224,8 @@ META-INF/*.DSA META-INF/*.RSA META-INF/services/javax.* + **/*.proto + hbase-webapps/** diff --git a/packaging/hudi-hadoop-mr-bundle/pom.xml b/packaging/hudi-hadoop-mr-bundle/pom.xml index 612a1b7f30b8b..48fe3c7d64cc0 100644 --- a/packaging/hudi-hadoop-mr-bundle/pom.xml +++ b/packaging/hudi-hadoop-mr-bundle/pom.xml @@ -67,8 +67,9 @@ org.apache.hudi:hudi-common org.apache.hudi:hudi-hadoop-mr - + org.apache.parquet:parquet-avro + org.apache.parquet:parquet-hadoop-bundle org.apache.avro:avro com.esotericsoftware:kryo-shaded org.objenesis:objenesis @@ -131,8 +132,12 @@ org.apache.hudi.org.apache.htrace. - org.apache.parquet.avro. - org.apache.hudi.org.apache.parquet.avro. + org.apache.parquet. + org.apache.hudi.org.apache.parquet. + + + shaded.parquet. + org.apache.hudi.shaded.parquet. com.google.common. 
@@ -258,6 +263,13 @@ compile + + org.apache.parquet + parquet-hadoop-bundle + ${parquet.version} + compile + + org.apache.avro diff --git a/packaging/hudi-kafka-connect-bundle/pom.xml b/packaging/hudi-kafka-connect-bundle/pom.xml index 6adeae0fc2715..2914f2221ebed 100644 --- a/packaging/hudi-kafka-connect-bundle/pom.xml +++ b/packaging/hudi-kafka-connect-bundle/pom.xml @@ -85,7 +85,7 @@ org.apache.hudi:hudi-aws - org.apache.hudi:hudi-flink_${scala.binary.version} + org.apache.hudi:hudi-flink org.apache.hudi:flink-core org.apache.hudi:hudi-flink-client org.apache.flink:flink-core diff --git a/packaging/hudi-presto-bundle/pom.xml b/packaging/hudi-presto-bundle/pom.xml index ada62cfee47c7..d744cd7471519 100644 --- a/packaging/hudi-presto-bundle/pom.xml +++ b/packaging/hudi-presto-bundle/pom.xml @@ -74,14 +74,15 @@ com.esotericsoftware:kryo-shaded org.objenesis:objenesis com.esotericsoftware:minlog + org.apache.commons:commons-lang3 org.apache.hbase:hbase-common org.apache.hbase:hbase-client org.apache.hbase:hbase-hadoop-compat org.apache.hbase:hbase-hadoop2-compat org.apache.hbase:hbase-metrics org.apache.hbase:hbase-metrics-api - org.apache.hbase:hbase-protocol org.apache.hbase:hbase-protocol-shaded + org.apache.hbase:hbase-server org.apache.hbase.thirdparty:hbase-shaded-miscellaneous org.apache.hbase.thirdparty:hbase-shaded-netty org.apache.hbase.thirdparty:hbase-shaded-protobuf @@ -149,6 +150,10 @@ org.apache.commons.lang. ${presto.bundle.bootstrap.shade.prefix}org.apache.commons.lang. + + org.apache.commons.lang3. + org.apache.hudi.org.apache.commons.lang3. + com.google.protobuf. ${presto.bundle.bootstrap.shade.prefix}com.google.protobuf. diff --git a/packaging/hudi-trino-bundle/pom.xml b/packaging/hudi-trino-bundle/pom.xml index 5052038507a8a..d2423f2835137 100644 --- a/packaging/hudi-trino-bundle/pom.xml +++ b/packaging/hudi-trino-bundle/pom.xml @@ -83,7 +83,6 @@ org.apache.hbase:hbase-metrics-api org.apache.hbase:hbase-protocol-shaded org.apache.hbase:hbase-server - org.apache.hbase:hbase-annotations org.apache.hbase.thirdparty:hbase-shaded-protobuf org.apache.hbase.thirdparty:hbase-shaded-netty org.apache.hbase.thirdparty:hbase-shaded-miscellaneous diff --git a/packaging/hudi-utilities-slim-bundle/pom.xml b/packaging/hudi-utilities-slim-bundle/pom.xml index 0803b1f143662..60f0af9d64f07 100644 --- a/packaging/hudi-utilities-slim-bundle/pom.xml +++ b/packaging/hudi-utilities-slim-bundle/pom.xml @@ -112,7 +112,6 @@ org.apache.httpcomponents:httpcore org.apache.httpcomponents:fluent-hc org.antlr:stringtemplate - org.apache.parquet:parquet-avro com.github.davidmoten:guava-mini com.github.davidmoten:hilbert-curve @@ -394,13 +393,6 @@ ${project.version} - - - org.apache.parquet - parquet-avro - compile - - ${hive.groupid} diff --git a/pom.xml b/pom.xml index 570259b65d2a3..7caff57f066b4 100644 --- a/pom.xml +++ b/pom.xml @@ -1570,8 +1570,6 @@ ${scala12.version} 2.12 - true - true @@ -1613,6 +1611,9 @@ hudi-spark-datasource/hudi-spark2 hudi-spark-datasource/hudi-spark2-common + + true + true @@ -1631,6 +1632,7 @@ 2.4 + true diff --git a/rfc/README.md b/rfc/README.md index 0009a1b72b953..532f38fc5dce3 100644 --- a/rfc/README.md +++ b/rfc/README.md @@ -72,16 +72,19 @@ The list of all RFCs can be found here. 
| 34 | [Hudi BigQuery Integration](./rfc-34/rfc-34.md) | `COMPLETED` | | 35 | [Make Flink MOR table writing streaming friendly](https://cwiki.apache.org/confluence/display/HUDI/RFC-35%3A+Make+Flink+MOR+table+writing+streaming+friendly)| `UNDER REVIEW` | | 36 | [HUDI Metastore Server](https://cwiki.apache.org/confluence/display/HUDI/%5BWIP%5D+RFC-36%3A+HUDI+Metastore+Server)| `UNDER REVIEW` | -| 37 | [Hudi Metadata based Bloom Index](rfc-37/rfc-37.md) | `IN PROGRESS` | +| 37 | [Hudi Metadata based Bloom Index](rfc-37/rfc-37.md) | `ONGOING` | | 38 | [Spark Datasource V2 Integration](./rfc-38/rfc-38.md) | `IN PROGRESS` | | 39 | [Incremental source for Debezium](./rfc-39/rfc-39.md) | `ONGOING` | -| 40 | [Hudi Connector for Trino](./rfc-40/rfc-40.md) | `ONGOING` | +| 40 | [Hudi Connector for Trino](./rfc-40/rfc-40.md) | `IN PROGRESS` | | 41 | [Hudi Snowflake Integration] | `UNDER REVIEW`| | 42 | [Consistent Hashing Index](./rfc-42/rfc-42.md) | `IN PROGRESS` | | 43 | [Compaction / Clustering Service](./rfc-43/rfc-43.md) | `UNDER REVIEW` | -| 44 | [Hudi Connector for Presto](./rfc-44/rfc-44.md) | `UNDER REVIEW` | +| 44 | [Hudi Connector for Presto](./rfc-44/rfc-44.md) | `IN PROGRESS` | | 45 | [Asynchronous Metadata Indexing](./rfc-45/rfc-45.md) | `ONGOING` | | 46 | [Optimizing Record Payload Handling](./rfc-46/rfc-46.md) | `UNDER REVIEW` | | 47 | [Add Call Produce Command for Spark SQL](./rfc-47/rfc-47.md) | `UNDER REVIEW` | | 48 | [LogCompaction for MOR tables](./rfc-48/rfc-48.md) | `UNDER REVIEW` | | 49 | [Support sync with DataHub](./rfc-49/rfc-49.md) | `ONGOING` | +| 50 | [Improve Timeline Server](./rfc-50/rfc-50.md) | `UNDER REVIEW` | +| 51 | [Change Data Capture](./rfc-51/rfc-51.md) | `UNDER REVIEW` | +| 52 | [Introduce Secondary Index to Improve HUDI Query Performance](./rfc-52/rfc-52.md) | `UNDER REVIEW` | diff --git a/rfc/rfc-44/presto-connector.png b/rfc/rfc-44/presto-connector.png new file mode 100644 index 0000000000000..21248e0f1ca01 Binary files /dev/null and b/rfc/rfc-44/presto-connector.png differ diff --git a/rfc/rfc-44/rfc-44.md b/rfc/rfc-44/rfc-44.md new file mode 100644 index 0000000000000..a55cce46ec684 --- /dev/null +++ b/rfc/rfc-44/rfc-44.md @@ -0,0 +1,158 @@ + + +# RFC-44: Hudi Connector for Presto + +## Proposers + +- @7c00 + +## Approvers + +- @codope +- @vinothchandar + +## Status + +JIRA: [HUDI-3210](https://issues.apache.org/jira/browse/HUDI-3210) + +> Please keep the status updated in `rfc/README.md`. + +## Abstract + +Support for querying Hudi tables in Presto is currently provided by the Presto Hive connector. The implementation is built on the +InputFormat interface from the hudi-hadoop-mr module. This approach has known performance and stability issues. It is also +hard to adopt new Hudi features due to the restrictions imposed by the current code. A separate Hudi connector would make it +possible to apply Hudi-specific optimizations, add new functionality, integrate advanced features, and evolve rapidly with +the upstream project. + +## Background + +The current Presto integration reads Hudi tables as regular Hive tables with some additional processing. For a COW or +MOR-RO table, Presto applies a dedicated file filter (`HoodieROTablePathFilter`) to prune invalid files during split +generation. For an MOR-RT table, Presto delegates split generation to `HoodieParquetRealtimeInputFormat#getSplits`, +and data loading to the reader created by `HoodieParquetRealtimeInputFormat#getRecordReader`. + +This implementation takes advantage of existing code, but it has some drawbacks:
+ +- Due to the MVCC design, file layouts of Hudi tables are quite different from those of regular Hive tables. + Mixing the split generation for Hudi and non-Hudi tables makes it hard to integrate custom split scheduling + strategies. For example, HoodieParquetRealtimeInputFormat generates one split per file slice, which is not performant + and could be improved by finer-grained splitting. +- In order to transport Hudi-only split properties, CustomSplitConverter is applied. The restrictions of the current code + make such hacky and tricky code necessary and increase the difficulty of testing. They would also keep making it harder + to adopt new Hudi features in the future. +- Because most work is delegated to HoodieParquetRealtimeInputFormat, memory usage is outside the control of Presto's memory + management mechanism. That hurts system robustness: workers tend to crash with OOM errors when querying a large MOR-RT table. + +With a new, separate connector, the Hudi integration can be improved without the restrictions of the current code in the Hive +connector. A separate connector also isolates Hudi's bugs from the Hive connector, which gives more confidence to add +new code, since it can never break the Hive connector. + +## Implementation + +Presto provides a set of service provider interfaces (SPI) that allow developers to create plugins which are +dynamically loaded into the Presto runtime to add custom functionality. A connector is a kind of plugin, +which can read data from, and sink data to, external data sources. + +To create a connector for Hudi, the SPIs below are to be implemented: + +- **Plugin**: the entry class that creates plugin components (including connectors), instantiated and invoked by Presto + internal services when booting. `HudiPlugin`, which implements the Plugin interface, is to be added for the Hudi connector. +- **ConnectorFactory**: the factory class that creates connector instances. `HudiConnectorFactory`, which implements + ConnectorFactory, is to be added for the Hudi connector. +- **Connector**: the facade of all service classes for a connector. `HudiConnector`, which implements Connector, + is to be added for the Hudi connector. The primary service classes for the Hudi connector are listed below. + - **ConnectorMetadata**: the service class that retrieves (and updates, if possible) metadata from/to a data source. + `HudiMetadata`, which implements ConnectorMetadata, is to be added to the Hudi connector. + - **ConnectorSplitManager**: the service class that generates ConnectorSplits for accessing the data source. + A ConnectorSplit is similar to a Split in the Hadoop MapReduce computation paradigm. `HudiConnectorSplitManager`, + which implements ConnectorSplitManager, is to be added to the Hudi connector. + - **ConnectorPageSourceProvider**: the service class that generates a reader to load data from the data source. + `HudiPageSourceProvider`, which implements ConnectorPageSourceProvider, is to be added to the Hudi connector. + +There are other service classes (e.g. `ConnectorPageSinkProvider`) that are not covered in this RFC but might be +implemented in the future. A class-diagrammatic view of the different components is shown below, followed by a +simplified sketch of how these pieces fit together. + +![](presto-connector.png)
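The sketch below is a minimal, self-contained illustration of how the entry points named above could be wired together. It deliberately uses simplified stand-in interfaces rather than the real Presto SPI signatures, so it should be read as a structural sketch of the classes listed above, not as the actual implementation; `HudiSplitManager` here is shorthand for the `HudiConnectorSplitManager` mentioned earlier.

```java
import java.util.List;
import java.util.Map;

// Simplified stand-ins for the Presto SPI types discussed above (not the real interfaces).
interface ConnectorMetadata {}
interface ConnectorSplitManager {}
interface ConnectorPageSourceProvider {}

interface Connector {
    ConnectorMetadata getMetadata();
    ConnectorSplitManager getSplitManager();
    ConnectorPageSourceProvider getPageSourceProvider();
}

interface ConnectorFactory {
    String getName();
    Connector create(String catalogName, Map<String, String> config);
}

interface Plugin {
    List<ConnectorFactory> getConnectorFactories();
}

// Entry class instantiated by Presto when the plugin is loaded.
class HudiPlugin implements Plugin {
    @Override
    public List<ConnectorFactory> getConnectorFactories() {
        return List.of(new HudiConnectorFactory());
    }
}

// Creates HudiConnector instances for catalogs configured with this connector's name.
class HudiConnectorFactory implements ConnectorFactory {
    @Override
    public String getName() {
        return "hudi";
    }

    @Override
    public Connector create(String catalogName, Map<String, String> config) {
        return new HudiConnector(new HudiMetadata(), new HudiSplitManager(), new HudiPageSourceProvider());
    }
}

// Facade that hands out the per-service classes described in the sections that follow.
class HudiConnector implements Connector {
    private final ConnectorMetadata metadata;
    private final ConnectorSplitManager splitManager;
    private final ConnectorPageSourceProvider pageSourceProvider;

    HudiConnector(ConnectorMetadata metadata, ConnectorSplitManager splitManager,
                  ConnectorPageSourceProvider pageSourceProvider) {
        this.metadata = metadata;
        this.splitManager = splitManager;
        this.pageSourceProvider = pageSourceProvider;
    }

    @Override public ConnectorMetadata getMetadata() { return metadata; }
    @Override public ConnectorSplitManager getSplitManager() { return splitManager; }
    @Override public ConnectorPageSourceProvider getPageSourceProvider() { return pageSourceProvider; }
}

// Service classes; their responsibilities are described below.
class HudiMetadata implements ConnectorMetadata {}
class HudiSplitManager implements ConnectorSplitManager {}
class HudiPageSourceProvider implements ConnectorPageSourceProvider {}
```

In the real connector these classes would implement the interfaces from the Presto SPI package and be registered through the plugin mechanism described above; the stand-in interfaces exist only to keep the sketch self-contained.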
+### HudiMetadata + +HudiMetadata implements the ConnectorMetadata interface and provides methods to access metadata such as databases +(called schemas in Presto), table and column structures, statistics, properties, and other info. +As Hudi table metadata is synchronized to the Hive MetaStore, HudiMetadata reuses ExtendedHiveMetastore from the Presto codebase +to retrieve Hudi table metadata. In addition, HudiMetadata makes use of HoodieTableMetaClient to expose Hudi-specific +metadata, e.g. the list of instants of a table timeline, represented as a SystemTable. + +### HudiSplitManager & HudiSplit + +HudiSplitManager implements the ConnectorSplitManager interface. It partitions the Hudi table to be read into multiple +individual chunks (called ConnectorSplits in Presto), so that the data set can be processed in parallel. +HudiSplitManager also performs partition pruning where possible. + +HudiSplit, which implements ConnectorSplit, describes which files to read and their properties (offset, length, +location, etc.). For a normal Hudi table, HudiSplitManager generates one split per FileSlice at a certain instant. +For a Hudi table that contains large base files, HudiSplitManager divides the base file into multiple splits, +each containing all the log files and part of the base file (a sketch of this strategy is given after the HudiPageSourceProvider section below). +This should improve performance. + +For large Hudi tables, generating splits tends to take a long time. This can be improved in two ways. +First, increase parallelism: since partitions are independent of one another, processing them in multiple +threads, one thread per partition, helps reduce the wall-clock time. Second, make use of the Hudi metadata table or external +listing services, which avoids a large number of file listing requests to HDFS. + +### HudiPageSourceProvider & HudiPageSource + +For each HudiSplit, HudiPageSourceProvider generates a reader to load data from the data source (e.g. HCFS) +into memory. Presto organizes in-memory data in the form of Pages, which are laid out in a columnar fashion. +This design contributes to Presto's query performance. In particular, Presto ships its own readers to +accelerate data loading from columnar file formats such as Parquet and ORC. + +For snapshot queries on a COW table, HudiPageSourceProvider takes advantage of Presto's native ParquetPageSource, +which is well optimized for Parquet files as described above. + +For snapshot queries on an MOR table, the plan is to add a page source that combines ParquetPageSource and +HoodieRealtimeRecordReader. The new page source keeps memory usage under Presto's memory tracking framework. +A native log file reader optimized for Presto might be provided in the future.
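To make the splitting strategy described in the HudiSplitManager section concrete, here is a small, self-contained sketch of how one file slice could be cut into several splits, each covering a byte range of the base file plus all of the slice's log files. The 128 MB target size, the class names, and the field names are illustrative assumptions rather than values taken from this RFC, and a real implementation would also have to align split boundaries with Parquet row groups.

```java
import java.util.ArrayList;
import java.util.List;

// Illustrative split shape: a byte range of the base file plus all log files of the file slice.
record HudiSplitSketch(String baseFilePath, long start, long length, List<String> logFilePaths) {}

class SplitGeneratorSketch {

    // Target split size; an assumed value for illustration only.
    private static final long TARGET_SPLIT_BYTES = 128L * 1024 * 1024;

    // Divide one file slice into multiple splits so a large base file can be scanned in parallel.
    // Every split carries the full list of log files, because log records may update any row of the base file.
    static List<HudiSplitSketch> toSplits(String baseFilePath, long baseFileSize, List<String> logFilePaths) {
        List<HudiSplitSketch> splits = new ArrayList<>();
        if (baseFileSize <= TARGET_SPLIT_BYTES) {
            splits.add(new HudiSplitSketch(baseFilePath, 0L, baseFileSize, logFilePaths));
            return splits;
        }
        for (long offset = 0; offset < baseFileSize; offset += TARGET_SPLIT_BYTES) {
            long length = Math.min(TARGET_SPLIT_BYTES, baseFileSize - offset);
            splits.add(new HudiSplitSketch(baseFilePath, offset, length, logFilePaths));
        }
        return splits;
    }
}
```

Running such split generation per partition, with one thread per partition as suggested above, would keep split enumeration for large tables off the critical path.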
+ +## Rollout/Adoption Plan + +- What impact (if any) will there be on existing users? + + There will be no impact on existing users because this is a new connector. It does not change the behavior of the current + integration through the existing Hive connector; it gives users more choice. However, in order to use this connector, the catalog name should be changed from `hive` to `hudi`. + For example, after users have configured the Hudi connector, `USE hudi.schema_name` should be used instead of `USE hive.schema_name`. + +- What do we lose if we move away from the Hive connector? + + Some features of the Hive connector (e.g. RaptorX) might be temporarily unavailable during the early development stage of + the Hudi connector. Most of them will be ported to the Hudi connector as it evolves. + +- If we need special migration tools, describe them here. + + The Hudi connector provides another choice besides the Hive connector for querying Hudi tables from Presto. + No migration is needed. + +- When will we remove the existing behavior? + + We are not proposing to remove the existing behavior. We hope that a critical mass of users will want to use + the new Hudi connector. That said, we should continue to support the current integration. + +## Test Plan + +- [x] POC for snapshot query on COW table +- [ ] Unit tests for the connector +- [ ] Product integration tests +- [ ] Benchmark snapshot query for large tables \ No newline at end of file diff --git a/scripts/release/deploy_staging_jars.sh b/scripts/release/deploy_staging_jars.sh index 76f018e2025e9..fa71faae80d6d 100755 --- a/scripts/release/deploy_staging_jars.sh +++ b/scripts/release/deploy_staging_jars.sh @@ -37,10 +37,14 @@ if [ "$#" -gt "1" ]; then fi declare -a ALL_VERSION_OPTS=( +"-Dscala-2.11 -Dspark2 -Dflink1.13" # for legacy bundle name +"-Dscala-2.12 -Dspark2 -Dflink1.13" # for legacy bundle name +"-Dscala-2.12 -Dspark3 -Dflink1.14" # for legacy bundle name "-Dscala-2.11 -Dspark2.4 -Dflink1.13" +"-Dscala-2.11 -Dspark2.4 -Dflink1.14" "-Dscala-2.12 -Dspark2.4 -Dflink1.13" -"-Dscala-2.12 -Dspark3.1 -Dflink1.14" "-Dscala-2.12 -Dspark3.2 -Dflink1.14" +"-Dscala-2.12 -Dspark3.1 -Dflink1.14" # run this last to make sure utilities bundle has spark 3.1 ) printf -v joined "'%s'\n" "${ALL_VERSION_OPTS[@]}" diff --git a/scripts/release/validate_staged_bundles.sh b/scripts/release/validate_staged_bundles.sh new file mode 100755 index 0000000000000..b210f39fbfaeb --- /dev/null +++ b/scripts/release/validate_staged_bundles.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
+# + +# fail immediately +set -o errexit +set -o nounset + +REPO=$1 +VERSION=$2 + +STAGING_REPO="https://repository.apache.org/content/repositories/${REPO}/org/apache/hudi" + +declare -a BUNDLE_URLS=( +"${STAGING_REPO}/hudi-datahub-sync-bundle/${VERSION}/hudi-datahub-sync-bundle-${VERSION}.jar" +"${STAGING_REPO}/hudi-flink1.13-bundle_2.11/${VERSION}/hudi-flink1.13-bundle_2.11-${VERSION}.jar" +"${STAGING_REPO}/hudi-flink1.13-bundle_2.12/${VERSION}/hudi-flink1.13-bundle_2.12-${VERSION}.jar" +"${STAGING_REPO}/hudi-flink1.14-bundle_2.11/${VERSION}/hudi-flink1.14-bundle_2.11-${VERSION}.jar" +"${STAGING_REPO}/hudi-flink1.14-bundle_2.12/${VERSION}/hudi-flink1.14-bundle_2.12-${VERSION}.jar" +"${STAGING_REPO}/hudi-gcp-bundle/${VERSION}/hudi-gcp-bundle-${VERSION}.jar" +"${STAGING_REPO}/hudi-hadoop-mr-bundle/${VERSION}/hudi-hadoop-mr-bundle-${VERSION}.jar" +"${STAGING_REPO}/hudi-hive-sync-bundle/${VERSION}/hudi-hive-sync-bundle-${VERSION}.jar" +"${STAGING_REPO}/hudi-integ-test-bundle/${VERSION}/hudi-integ-test-bundle-${VERSION}.jar" +"${STAGING_REPO}/hudi-kafka-connect-bundle/${VERSION}/hudi-kafka-connect-bundle-${VERSION}.jar" +"${STAGING_REPO}/hudi-presto-bundle/${VERSION}/hudi-presto-bundle-${VERSION}.jar" +"${STAGING_REPO}/hudi-spark-bundle_2.11/${VERSION}/hudi-spark-bundle_2.11-${VERSION}.jar" +"${STAGING_REPO}/hudi-spark-bundle_2.12/${VERSION}/hudi-spark-bundle_2.12-${VERSION}.jar" +"${STAGING_REPO}/hudi-spark2.4-bundle_2.11/${VERSION}/hudi-spark2.4-bundle_2.11-${VERSION}.jar" +"${STAGING_REPO}/hudi-spark2.4-bundle_2.12/${VERSION}/hudi-spark2.4-bundle_2.12-${VERSION}.jar" +"${STAGING_REPO}/hudi-spark3-bundle_2.12/${VERSION}/hudi-spark3-bundle_2.12-${VERSION}.jar" +"${STAGING_REPO}/hudi-spark3.1-bundle_2.12/${VERSION}/hudi-spark3.1-bundle_2.12-${VERSION}.jar" +"${STAGING_REPO}/hudi-spark3.2-bundle_2.12/${VERSION}/hudi-spark3.2-bundle_2.12-${VERSION}.jar" +"${STAGING_REPO}/hudi-timeline-server-bundle/${VERSION}/hudi-timeline-server-bundle-${VERSION}.jar" +"${STAGING_REPO}/hudi-trino-bundle/${VERSION}/hudi-trino-bundle-${VERSION}.jar" +"${STAGING_REPO}/hudi-utilities-bundle_2.11/${VERSION}/hudi-utilities-bundle_2.11-${VERSION}.jar" +"${STAGING_REPO}/hudi-utilities-bundle_2.12/${VERSION}/hudi-utilities-bundle_2.12-${VERSION}.jar" +"${STAGING_REPO}/hudi-utilities-slim-bundle_2.11/${VERSION}/hudi-utilities-slim-bundle_2.11-${VERSION}.jar" +"${STAGING_REPO}/hudi-utilities-slim-bundle_2.12/${VERSION}/hudi-utilities-slim-bundle_2.12-${VERSION}.jar" +) + +NOW=$(date +%s) +TMP_DIR_FOR_BUNDLES=/tmp/${NOW} +mkdir "$TMP_DIR_FOR_BUNDLES" +for url in "${BUNDLE_URLS[@]}"; do + echo "downloading $url" + wget "$url" -P "$TMP_DIR_FOR_BUNDLES" +done + +ls -l "$TMP_DIR_FOR_BUNDLES" diff --git a/scripts/release/validate_staged_release.sh b/scripts/release/validate_staged_release.sh index 0e027442570a5..41295edfee614 100755 --- a/scripts/release/validate_staged_release.sh +++ b/scripts/release/validate_staged_release.sh @@ -78,9 +78,8 @@ pushd $WORK_DIR # Checkout dist repo LOCAL_SVN_DIR=local_svn_dir -ROOT_SVN_URL=https://dist.apache.org/repos/dist/ +ROOT_SVN_URL=https://dist.apache.org/repos/dist REPO_TYPE=${RELEASE_TYPE} -#RELEASE_REPO=release HUDI_REPO=hudi if [ $RC_NUM == -1 ]; then @@ -99,7 +98,15 @@ echo "Downloading from svn co ${ROOT_SVN_URL}/${REPO_TYPE}/${HUDI_REPO}" (bash -c "svn co ${ROOT_SVN_URL}/${REPO_TYPE}/${HUDI_REPO} $REDIRECT") || (echo -e "\t\t Unable to checkout ${ROOT_SVN_URL}/${REPO_TYPE}/${HUDI_REPO} to $REDIRECT. 
Please run with --verbose to get details\n" && exit -1) echo "Validating hudi-${ARTIFACT_SUFFIX} with release type \"${REPO_TYPE}\"" -cd ${HUDI_REPO}/hudi-${ARTIFACT_SUFFIX} +if [ $RELEASE_TYPE == "release" ]; then + ARTIFACT_PREFIX= +elif [ $RELEASE_TYPE == "dev" ]; then + ARTIFACT_PREFIX='hudi-' +else + echo "Unexpected RELEASE_TYPE: $RELEASE_TYPE" + exit 1; +fi +cd ${HUDI_REPO}/${ARTIFACT_PREFIX}${ARTIFACT_SUFFIX} $SHASUM hudi-${ARTIFACT_SUFFIX}.src.tgz > got.sha512 echo "Checking Checksum of Source Release"