diff --git a/.github/workflows/bot.yml b/.github/workflows/bot.yml index ebf3caccd9c62..20515f7c750ed 100644 --- a/.github/workflows/bot.yml +++ b/.github/workflows/bot.yml @@ -18,12 +18,16 @@ jobs: include: - scala: "scala-2.11" spark: "spark2" + skipModules: "" - scala: "scala-2.11" spark: "spark2,spark-shade-unbundle-avro" + skipModules: "" - scala: "scala-2.12" spark: "spark3.1.x" + skipModules: "!hudi-spark-datasource/hudi-spark3" - scala: "scala-2.12" spark: "spark3.1.x,spark-shade-unbundle-avro" + skipModules: "!hudi-spark-datasource/hudi-spark3" - scala: "scala-2.12" spark: "spark3" - scala: "scala-2.12" @@ -40,4 +44,5 @@ jobs: env: SCALA_PROFILE: ${{ matrix.scala }} SPARK_PROFILE: ${{ matrix.spark }} - run: mvn install -P "$SCALA_PROFILE,$SPARK_PROFILE" -DskipTests=true -Dmaven.javadoc.skip=true -B -V + SKIP_MODULES: ${{ matrix.skipModules }} + run: mvn install -P "$SCALA_PROFILE,$SPARK_PROFILE" -pl "$SKIP_MODULES" -DskipTests=true -Dmaven.javadoc.skip=true -B -V diff --git a/README.md b/README.md index af11e6a14d5df..6d3475755ff87 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ mvn clean package -DskipTests -Dscala-2.12 The default Spark version supported is 2.4.4. To build for different Spark 3 versions, use the corresponding profile ``` -# Build against Spark 3.2.0 (the default build shipped with the public Spark 3 bundle) +# Build against Spark 3.2.1 (the default build shipped with the public Spark 3 bundle) mvn clean package -DskipTests -Dspark3 # Build against Spark 3.1.2 diff --git a/doap_HUDI.rdf b/doap_HUDI.rdf index 33f64ecf82ecb..a3b958a5cd7d0 100644 --- a/doap_HUDI.rdf +++ b/doap_HUDI.rdf @@ -81,6 +81,11 @@ <created>2021-12-08</created> <revision>0.10.0</revision> </release> + <release> + <name>Apache Hudi 0.10.1</name> + <created>2022-01-26</created> + <revision>0.10.1</revision> + </release> diff --git a/docker/demo/config/test-suite/cow-spark-long-running.yaml b/docker/demo/config/test-suite/cow-spark-long-running.yaml index 493ad7a5578f6..8a1e58f840a37 100644 --- a/docker/demo/config/test-suite/cow-spark-long-running.yaml +++ b/docker/demo/config/test-suite/cow-spark-long-running.yaml @@ -13,13 +13,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
-dag_name: cow-spark-long-running-multi-partitions.yaml -dag_rounds: 50 -dag_intermittent_delay_mins: 1 +dag_name: cow-spark-deltastreamer-long-running-multi-partitions.yaml +dag_rounds: 30 +dag_intermittent_delay_mins: 0 dag_content: first_insert: config: - record_size: 1000 + record_size: 200 num_partitions_insert: 50 repeat_count: 1 num_records_insert: 10000 @@ -33,12 +33,12 @@ dag_content: deps: first_insert first_validate: config: - validate_hive: true + validate_hive: false type: ValidateDatasetNode deps: first_hive_sync first_upsert: config: - record_size: 1000 + record_size: 200 num_partitions_insert: 50 num_records_insert: 300 repeat_count: 1 @@ -60,13 +60,13 @@ dag_content: deps: first_delete second_validate: config: - validate_hive: true + validate_hive: false delete_input_data: true type: ValidateDatasetNode deps: second_hive_sync last_validate: config: - execute_itr_count: 50 + execute_itr_count: 30 validate_clean: true validate_archival: true type: ValidateAsyncOperations diff --git a/docker/demo/config/test-suite/cow-spark-simple.yaml b/docker/demo/config/test-suite/cow-spark-simple.yaml index 21e7e6bbe39bc..0859c63200203 100644 --- a/docker/demo/config/test-suite/cow-spark-simple.yaml +++ b/docker/demo/config/test-suite/cow-spark-simple.yaml @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. dag_name: cow-spark-simple.yaml -dag_rounds: 2 +dag_rounds: 1 dag_intermittent_delay_mins: 1 dag_content: first_insert: @@ -33,7 +33,7 @@ dag_content: deps: first_insert first_validate: config: - validate_hive: true + validate_hive: false type: ValidateDatasetNode deps: first_hive_sync first_upsert: @@ -60,7 +60,7 @@ dag_content: deps: first_delete second_validate: config: - validate_hive: true + validate_hive: false delete_input_data: false type: ValidateDatasetNode deps: second_hive_sync \ No newline at end of file diff --git a/docker/demo/config/test-suite/cow-long-running-multi-partitions.yaml b/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions-hive.yaml similarity index 96% rename from docker/demo/config/test-suite/cow-long-running-multi-partitions.yaml rename to docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions-hive.yaml index 0ce529805567b..324a4b4a6d0d5 100644 --- a/docker/demo/config/test-suite/cow-long-running-multi-partitions.yaml +++ b/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions-hive.yaml @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -dag_name: cow-long-running-multi-partitions.yaml +dag_name: deltastreamer-long-running-multi-partitions.yaml dag_rounds: 50 dag_intermittent_delay_mins: 1 dag_content: @@ -76,7 +76,7 @@ dag_content: deps: first_delete second_validate: config: - validate_hive: false + validate_hive: true delete_input_data: true type: ValidateDatasetNode deps: second_hive_sync diff --git a/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions.yaml b/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions.yaml new file mode 100644 index 0000000000000..9d2766f1a5a7e --- /dev/null +++ b/docker/demo/config/test-suite/deltastreamer-long-running-multi-partitions.yaml @@ -0,0 +1,78 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +dag_name: deltastreamer-long-running-multi-partitions.yaml +dag_rounds: 50 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 5 + repeat_count: 1 + num_records_insert: 1000 + type: InsertNode + deps: none + second_insert: + config: + record_size: 1000 + num_partitions_insert: 50 + repeat_count: 1 + num_records_insert: 10000 + deps: first_insert + type: InsertNode + third_insert: + config: + record_size: 1000 + num_partitions_insert: 2 + repeat_count: 1 + num_records_insert: 300 + deps: second_insert + type: InsertNode + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 2 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 100 + num_partitions_upsert: 1 + type: UpsertNode + deps: third_insert + first_delete: + config: + num_partitions_delete: 50 + num_records_delete: 8000 + type: DeleteNode + deps: first_upsert + second_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: first_delete + second_validate: + config: + validate_hive: false + delete_input_data: true + type: ValidateDatasetNode + deps: second_hive_sync + last_validate: + config: + execute_itr_count: 50 + validate_clean: true + validate_archival: true + type: ValidateAsyncOperations + deps: second_validate diff --git a/docker/demo/config/test-suite/deltastreamer-medium-full-dataset-validation.yaml b/docker/demo/config/test-suite/deltastreamer-medium-full-dataset-validation.yaml new file mode 100644 index 0000000000000..2fc4961e15c07 --- /dev/null +++ b/docker/demo/config/test-suite/deltastreamer-medium-full-dataset-validation.yaml @@ -0,0 +1,81 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
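The dag_content blocks in these test-suite files chain nodes linearly: each node's deps field names the node that must complete first, starting from deps: none. Below is a minimal, illustrative Java sketch (not the Hudi integration-test-suite API; node names mirror the deltastreamer-long-running-multi-partitions.yaml above) of resolving that ordering:

```java
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Illustrative only: derives the execution order implied by each node's
// `deps` entry in the DAG above. The real test suite builds DagNode objects;
// this simply shows how the chain from "none" to last_validate resolves.
public class DagOrderSketch {
  public static void main(String[] args) {
    Map<String, String> deps = new LinkedHashMap<>();
    deps.put("first_insert", "none");
    deps.put("second_insert", "first_insert");
    deps.put("third_insert", "second_insert");
    deps.put("first_upsert", "third_insert");
    deps.put("first_delete", "first_upsert");
    deps.put("second_hive_sync", "first_delete");
    deps.put("second_validate", "second_hive_sync");
    deps.put("last_validate", "second_validate");

    List<String> order = new ArrayList<>();
    String current = "none";
    for (int i = 0; i < deps.size(); i++) {
      for (Map.Entry<String, String> e : deps.entrySet()) {
        // Schedule the node whose dependency was scheduled last.
        if (e.getValue().equals(current) && !order.contains(e.getKey())) {
          order.add(e.getKey());
          current = e.getKey();
          break;
        }
      }
    }
    System.out.println(order); // [first_insert, second_insert, ..., last_validate]
  }
}
```

Each dag_rounds iteration replays this chain, with dag_intermittent_delay_mins of pause between rounds.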
+ + # to be used with test-aggressive-clean-archival.properties + dag_name: deltastreamer-long-running-multi-partitions.yaml + dag_rounds: 20 + dag_intermittent_delay_mins: 1 + dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 5 + repeat_count: 1 + num_records_insert: 1000 + type: InsertNode + deps: none + second_insert: + config: + record_size: 1000 + num_partitions_insert: 50 + repeat_count: 1 + num_records_insert: 10000 + deps: first_insert + type: InsertNode + third_insert: + config: + record_size: 1000 + num_partitions_insert: 2 + repeat_count: 1 + num_records_insert: 300 + deps: second_insert + type: InsertNode + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 2 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 100 + num_partitions_upsert: 1 + type: UpsertNode + deps: third_insert + first_delete: + config: + num_partitions_delete: 50 + num_records_delete: 8000 + type: DeleteNode + deps: first_upsert + second_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: first_delete + second_validate: + config: + validate_hive: false + delete_input_data: false + type: ValidateDatasetNode + deps: second_hive_sync + last_validate: + config: + execute_itr_count: 20 + validate_clean: true + validate_archival: true + type: ValidateAsyncOperations + deps: second_validate diff --git a/docker/demo/config/test-suite/cow-long-running-example.yaml b/docker/demo/config/test-suite/deltastreamer-long-running-example.yaml similarity index 97% rename from docker/demo/config/test-suite/cow-long-running-example.yaml rename to docker/demo/config/test-suite/deltastreamer-long-running-example.yaml index 29b6858bf0506..28578eb9b687e 100644 --- a/docker/demo/config/test-suite/cow-long-running-example.yaml +++ b/docker/demo/config/test-suite/deltastreamer-long-running-example.yaml @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -dag_name: cow-long-running-example.yaml +dag_name: deltastreamer-long-running-example.yaml dag_rounds: 50 dag_intermittent_delay_mins: 1 dag_content: diff --git a/docker/demo/config/test-suite/insert-overwrite-table.yaml b/docker/demo/config/test-suite/insert-overwrite-table.yaml new file mode 100644 index 0000000000000..8b5a26e4683b7 --- /dev/null +++ b/docker/demo/config/test-suite/insert-overwrite-table.yaml @@ -0,0 +1,104 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+dag_name: simple-deltastreamer.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10000 + type: SparkInsertNode + deps: none + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 10 + num_records_insert: 1000 + repeat_count: 1 + num_records_upsert: 8000 + num_partitions_upsert: 10 + type: SparkUpsertNode + deps: first_insert + second_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10000 + type: SparkInsertNode + deps: first_upsert + second_upsert: + config: + record_size: 1000 + num_partitions_insert: 10 + num_records_insert: 1000 + repeat_count: 1 + num_records_upsert: 8000 + num_partitions_upsert: 10 + type: SparkUpsertNode + deps: second_insert + first_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: second_upsert + first_insert_overwrite_table: + config: + record_size: 1000 + repeat_count: 10 + num_records_insert: 10 + type: SparkInsertOverwriteTableNode + deps: first_hive_sync + delete_all_input_except_last: + config: + delete_input_data_except_latest: true + type: DeleteInputDatasetNode + deps: first_insert_overwrite_table + third_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10000 + type: SparkInsertNode + deps: delete_all_input_except_last + third_upsert: + config: + record_size: 1000 + num_partitions_insert: 10 + num_records_insert: 1000 + repeat_count: 1 + num_records_upsert: 8000 + num_partitions_upsert: 10 + type: SparkUpsertNode + deps: third_insert + second_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: third_upsert + second_validate: + config: + validate_full_data : true + validate_hive: false + delete_input_data: false + type: ValidateDatasetNode + deps: second_hive_sync diff --git a/docker/demo/config/test-suite/insert-overwrite.yaml b/docker/demo/config/test-suite/insert-overwrite.yaml new file mode 100644 index 0000000000000..f2299c50c08f3 --- /dev/null +++ b/docker/demo/config/test-suite/insert-overwrite.yaml @@ -0,0 +1,106 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
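The Spark*Node types in the DAG above (SparkInsertNode, SparkUpsertNode, and so on) exercise the Hudi Spark datasource directly rather than going through DeltaStreamer. As a rough illustration of the write a SparkUpsertNode issues — the input dataframe and paths below are placeholders, not part of the test suite:

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

// Rough sketch of the datasource upsert that SparkUpsertNode wraps.
// Input path and table base path are placeholders.
public class SparkUpsertSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("upsert-sketch").getOrCreate();
    Dataset<Row> df = spark.read().json("/tmp/input-batch"); // placeholder input batch

    df.write().format("hudi")
        .option("hoodie.datasource.write.operation", "upsert")
        .option("hoodie.datasource.write.recordkey.field", "_row_key")
        .option("hoodie.datasource.write.partitionpath.field", "timestamp")
        .option("hoodie.table.name", "table1")
        .mode(SaveMode.Append)
        .save("/tmp/hudi-test-table"); // placeholder base path
  }
}
```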
+dag_name: simple-deltastreamer.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + + first_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10000 + type: SparkInsertNode + deps: none + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 10 + num_records_insert: 1000 + repeat_count: 1 + num_records_upsert: 8000 + num_partitions_upsert: 10 + type: SparkUpsertNode + deps: first_insert + second_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10000 + type: SparkInsertNode + deps: first_upsert + second_upsert: + config: + record_size: 1000 + num_partitions_insert: 10 + num_records_insert: 1000 + repeat_count: 1 + num_records_upsert: 8000 + num_partitions_upsert: 10 + type: SparkUpsertNode + deps: second_insert + first_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: second_upsert + first_insert_overwrite: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10 + type: SparkInsertOverwriteNode + deps: first_hive_sync + delete_all_input_except_last: + config: + delete_input_data_except_latest: true + type: DeleteInputDatasetNode + deps: first_insert_overwrite + third_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10000 + type: SparkInsertNode + deps: delete_all_input_except_last + third_upsert: + config: + record_size: 1000 + num_partitions_insert: 10 + num_records_insert: 1000 + repeat_count: 1 + num_records_upsert: 8000 + num_partitions_upsert: 10 + type: SparkUpsertNode + deps: third_insert + second_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: third_upsert + second_validate: + config: + validate_full_data : true + validate_hive: false + delete_input_data: false + type: ValidateDatasetNode + deps: second_hive_sync diff --git a/docker/demo/config/test-suite/simple-clustering-hive.yaml b/docker/demo/config/test-suite/simple-clustering-hive.yaml new file mode 100644 index 0000000000000..e1f79bfe93c0f --- /dev/null +++ b/docker/demo/config/test-suite/simple-clustering-hive.yaml @@ -0,0 +1,76 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
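The two DAGs above differ only in the overwrite node they exercise: SparkInsertOverwriteNode maps to the "insert_overwrite" operation, which replaces only the partitions present in the incoming batch, while SparkInsertOverwriteTableNode maps to "insert_overwrite_table", which replaces the entire table. A hedged sketch of that switch (dataframe and base path are placeholders):

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;

// Sketch of the two overwrite operations the SparkInsertOverwrite*Node types
// exercise. "insert_overwrite" is partition-scoped; "insert_overwrite_table"
// replaces the whole table contents with the incoming batch.
public class InsertOverwriteSketch {
  static void overwrite(Dataset<Row> batch, boolean wholeTable) {
    batch.write().format("hudi")
        .option("hoodie.datasource.write.operation",
            wholeTable ? "insert_overwrite_table" : "insert_overwrite")
        .option("hoodie.datasource.write.recordkey.field", "_row_key")
        .option("hoodie.datasource.write.partitionpath.field", "timestamp")
        .option("hoodie.table.name", "table1")
        .mode(SaveMode.Append)
        .save("/tmp/hudi-test-table"); // placeholder base path
  }
}
```

This is why both DAGs delete the older input batches afterwards (delete_all_input_except_last): after an overwrite, only the latest batch is still expected to be queryable in full.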
+dag_name: simple-clustering-hive.yaml +dag_rounds: 30 +dag_intermittent_delay_mins: 0 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: InsertNode + deps: none + second_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 10000 + deps: first_insert + type: InsertNode + third_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 300 + deps: second_insert + type: InsertNode + first_delete: + config: + num_partitions_delete: 1 + num_records_delete: 9000 + type: DeleteNode + deps: third_insert + first_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: first_delete + first_validate: + config: + validate_hive: false + type: ValidateDatasetNode + deps: first_hive_sync + first_cluster: + config: + execute_itr_count: 20 + type: ClusteringNode + deps: first_validate + second_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: first_cluster + second_validate: + config: + validate_hive: true + type: ValidateDatasetNode + deps: second_hive_sync diff --git a/docker/demo/config/test-suite/cow-clustering-example.yaml b/docker/demo/config/test-suite/simple-clustering.yaml similarity index 96% rename from docker/demo/config/test-suite/cow-clustering-example.yaml rename to docker/demo/config/test-suite/simple-clustering.yaml index 95932317c04fd..7389ee3ebc34b 100644 --- a/docker/demo/config/test-suite/cow-clustering-example.yaml +++ b/docker/demo/config/test-suite/simple-clustering.yaml @@ -13,8 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -dag_name: cow-clustering-example.yaml -dag_rounds: 3 +dag_name: simple-clustering.yaml +dag_rounds: 30 dag_intermittent_delay_mins: 0 dag_content: first_insert: @@ -60,7 +60,7 @@ dag_content: deps: first_hive_sync first_cluster: config: - execute_itr_count: 2 + execute_itr_count: 25 type: ClusteringNode deps: first_validate second_hive_sync: diff --git a/docker/demo/config/test-suite/simple-deltastreamer-hive.yaml b/docker/demo/config/test-suite/simple-deltastreamer-hive.yaml new file mode 100644 index 0000000000000..e6738b6942b35 --- /dev/null +++ b/docker/demo/config/test-suite/simple-deltastreamer-hive.yaml @@ -0,0 +1,82 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+dag_name: simple-deltastreamer.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 1000 + type: InsertNode + deps: none + second_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 10000 + deps: first_insert + type: InsertNode + third_insert: + config: + record_size: 1000 + num_partitions_insert: 1 + repeat_count: 1 + num_records_insert: 300 + deps: second_insert + type: InsertNode + first_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: third_insert + first_validate: + config: + validate_hive: false + type: ValidateDatasetNode + deps: first_hive_sync + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 1 + num_records_insert: 300 + repeat_count: 1 + num_records_upsert: 100 + num_partitions_upsert: 1 + type: UpsertNode + deps: first_validate + first_delete: + config: + num_partitions_delete: 1 + num_records_delete: 2000 + type: DeleteNode + deps: first_upsert + second_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: first_delete + second_validate: + config: + validate_hive: true + delete_input_data: true + type: ValidateDatasetNode + deps: second_hive_sync diff --git a/docker/demo/config/test-suite/complex-dag-cow.yaml b/docker/demo/config/test-suite/simple-deltastreamer.yaml similarity index 98% rename from docker/demo/config/test-suite/complex-dag-cow.yaml rename to docker/demo/config/test-suite/simple-deltastreamer.yaml index 3a84b0a0acecd..f49a41baf8541 100644 --- a/docker/demo/config/test-suite/complex-dag-cow.yaml +++ b/docker/demo/config/test-suite/simple-deltastreamer.yaml @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -dag_name: complex-dag-cow.yaml +dag_name: simple-deltastreamer.yaml dag_rounds: 1 dag_intermittent_delay_mins: 1 dag_content: diff --git a/docker/demo/config/test-suite/spark-clustering.yaml b/docker/demo/config/test-suite/spark-clustering.yaml new file mode 100644 index 0000000000000..e8e722ca77c7c --- /dev/null +++ b/docker/demo/config/test-suite/spark-clustering.yaml @@ -0,0 +1,73 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+dag_name: cow-spark-simple.yaml +dag_rounds: 1 +dag_intermittent_delay_mins: 1 +dag_content: + first_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10000 + type: SparkInsertNode + deps: none + first_upsert: + config: + record_size: 1000 + num_partitions_insert: 10 + num_records_insert: 1000 + repeat_count: 1 + num_records_upsert: 8000 + num_partitions_upsert: 10 + type: SparkUpsertNode + deps: first_insert + second_insert: + config: + record_size: 1000 + num_partitions_insert: 10 + repeat_count: 1 + num_records_insert: 10000 + type: SparkInsertNode + deps: first_upsert + second_upsert: + config: + record_size: 1000 + num_partitions_insert: 10 + num_records_insert: 1000 + repeat_count: 1 + num_records_upsert: 8000 + num_partitions_upsert: 10 + type: SparkUpsertNode + deps: second_insert + first_delete: + config: + num_partitions_delete: 10 + num_records_delete: 16000 + type: SparkDeleteNode + deps: second_upsert + second_hive_sync: + config: + queue_name: "adhoc" + engine: "mr" + type: HiveSyncNode + deps: first_delete + second_validate: + config: + validate_hive: false + delete_input_data: false + type: ValidateDatasetNode + deps: second_hive_sync \ No newline at end of file diff --git a/docker/demo/config/test-suite/test-aggressive-clean-archival.properties b/docker/demo/config/test-suite/test-aggressive-clean-archival.properties new file mode 100644 index 0000000000000..dcbbfb31c9936 --- /dev/null +++ b/docker/demo/config/test-suite/test-aggressive-clean-archival.properties @@ -0,0 +1,54 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
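SparkDeleteNode in the DAG above issues a datasource delete: the incoming dataframe only needs to carry the keys of the records to remove. A rough sketch (placeholders for paths; selecting the victims by limit() mirrors num_records_delete, not the suite's actual selection logic):

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

// Rough sketch of the datasource delete that SparkDeleteNode wraps.
public class SparkDeleteSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("delete-sketch").getOrCreate();
    Dataset<Row> toDelete = spark.read().format("hudi")
        .load("/tmp/hudi-test-table") // placeholder base path
        .limit(16000);                // mirrors num_records_delete above

    toDelete.write().format("hudi")
        .option("hoodie.datasource.write.operation", "delete")
        .option("hoodie.datasource.write.recordkey.field", "_row_key")
        .option("hoodie.datasource.write.partitionpath.field", "timestamp")
        .option("hoodie.table.name", "table1")
        .mode(SaveMode.Append)
        .save("/tmp/hudi-test-table");
  }
}
```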
+# + +hoodie.insert.shuffle.parallelism=100 +hoodie.upsert.shuffle.parallelism=100 +hoodie.bulkinsert.shuffle.parallelism=100 + +hoodie.cleaner.commits.retained=5 +hoodie.keep.min.commits=9 +hoodie.keep.max.commits=10 + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.embed.timeline.server=false +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-clustering-aggressive-clean-archival.properties b/docker/demo/config/test-suite/test-clustering-aggressive-clean-archival.properties new file mode 100644 index 0000000000000..abddd77ba327a --- /dev/null +++ b/docker/demo/config/test-suite/test-clustering-aggressive-clean-archival.properties @@ -0,0 +1,61 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
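The three retention knobs in test-aggressive-clean-archival.properties above work as a pair of guardrails: the cleaner keeps file versions for the last 5 commits, and archival trims the active timeline down to 9 commits once it exceeds 10 — which is what the DAGs' validate_clean/validate_archival steps assert. A small sketch of the relationship (the consistency check reflects my reading that the cleaner must retain fewer commits than archival keeps):

```java
import java.util.Properties;

// Sketch of the aggressive clean/archival settings above and the ordering
// constraint between them (assumption: retained < keep.min.commits).
public class RetentionProps {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("hoodie.cleaner.commits.retained", "5"); // keep file slices of last 5 commits
    props.setProperty("hoodie.keep.min.commits", "9");         // archive timeline down to 9 commits...
    props.setProperty("hoodie.keep.max.commits", "10");        // ...whenever it grows past 10

    int retained = Integer.parseInt(props.getProperty("hoodie.cleaner.commits.retained"));
    int keepMin = Integer.parseInt(props.getProperty("hoodie.keep.min.commits"));
    if (retained >= keepMin) {
      throw new IllegalStateException("cleaner must retain fewer commits than archival keeps");
    }
  }
}
```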
+# + +hoodie.insert.shuffle.parallelism=100 +hoodie.upsert.shuffle.parallelism=100 +hoodie.bulkinsert.shuffle.parallelism=100 + +hoodie.cleaner.commits.retained=5 +hoodie.keep.min.commits=9 +hoodie.keep.max.commits=10 + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.embed.timeline.server=false +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-clustering-metadata-aggressive-clean-archival.properties b/docker/demo/config/test-suite/test-clustering-metadata-aggressive-clean-archival.properties new file mode 100644 index 0000000000000..931b1e3a09668 --- /dev/null +++ b/docker/demo/config/test-suite/test-clustering-metadata-aggressive-clean-archival.properties @@ -0,0 +1,63 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
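As I read the clustering block in test-clustering-aggressive-clean-archival.properties above: every 4 commits an inline clustering plan is scheduled and executed, rewriting files smaller than the 600 MB small-file limit into files of up to 1 GB, sorted by the listed columns via SparkSortAndSizeExecutionStrategy. A sketch restating those values with their intent:

```java
import java.util.Properties;

// Sketch of the inline-clustering settings above; the byte values are the
// exact ones from the properties file (600 MB and 1 GB respectively).
public class ClusteringProps {
  public static void main(String[] args) {
    Properties p = new Properties();
    p.setProperty("hoodie.clustering.inline", "true");          // cluster as part of the write path
    p.setProperty("hoodie.clustering.inline.max.commits", "4"); // trigger every 4 commits
    p.setProperty("hoodie.clustering.plan.strategy.sort.columns",
        "_hoodie_partition_path,_row_key");                     // sort key for rewritten files
    p.setProperty("hoodie.clustering.plan.strategy.small.file.limit",
        String.valueOf(600L * 1024 * 1024));                    // 629145600: rewrite files below 600 MB
    p.setProperty("hoodie.clustering.plan.strategy.target.file.max.bytes",
        String.valueOf(1024L * 1024 * 1024));                   // 1073741824: pack up to ~1 GB per file
  }
}
```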
+# + +hoodie.insert.shuffle.parallelism=100 +hoodie.upsert.shuffle.parallelism=100 +hoodie.bulkinsert.shuffle.parallelism=100 + +hoodie.cleaner.commits.retained=5 +hoodie.keep.min.commits=9 +hoodie.keep.max.commits=10 + +hoodie.metadata.enable=true + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.embed.timeline.server=false +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-clustering.properties b/docker/demo/config/test-suite/test-clustering.properties new file mode 100644 index 0000000000000..9aa4843b2746e --- /dev/null +++ b/docker/demo/config/test-suite/test-clustering.properties @@ -0,0 +1,57 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
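hoodie.metadata.enable=true in the file above makes writers maintain Hudi's internal metadata table, so this variant stresses metadata, clustering, and aggressive clean/archival together. As a hedged sketch — and assuming, on the read side, that the same key lets queries serve file listings from the metadata table instead of listing storage:

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

// Sketch: reading a table with metadata-table-based file listing enabled.
// The read-side use of this key is an assumption; the base path is a placeholder.
public class MetadataReadSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("metadata-sketch").getOrCreate();
    Dataset<Row> rows = spark.read().format("hudi")
        .option("hoodie.metadata.enable", "true")
        .load("/tmp/hudi-test-table");
    rows.show();
  }
}
```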
+# + +hoodie.insert.shuffle.parallelism=100 +hoodie.upsert.shuffle.parallelism=100 +hoodie.bulkinsert.shuffle.parallelism=100 + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.embed.timeline.server=false +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.clustering.inline=true +hoodie.clustering.inline.max.commits=4 +hoodie.clustering.plan.strategy.sort.columns=_hoodie_partition_path,_row_key +hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 +hoodie.clustering.plan.strategy.small.file.limit=629145600 +hoodie.clustering.execution.strategy.class=org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-metadata-aggressive-clean-archival.properties b/docker/demo/config/test-suite/test-metadata-aggressive-clean-archival.properties new file mode 100644 index 0000000000000..8935ffb4264be --- /dev/null +++ b/docker/demo/config/test-suite/test-metadata-aggressive-clean-archival.properties @@ -0,0 +1,56 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
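The key-generator settings shared by these properties files partition records by formatting the epoch-seconds timestamp field as yyyy/MM/dd — the slash-encoded day layout that SlashEncodedDayPartitionValueExtractor expects on the Hive-sync side. A plain-Java illustration of the transformation (this is not the TimestampBasedKeyGenerator class itself, just what its UNIX_TIMESTAMP + yyyy/MM/dd configuration produces):

```java
import java.time.Instant;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;

// Illustration of the partition path produced by the keygen settings above:
// a UNIX_TIMESTAMP value formatted with the yyyy/MM/dd output dateformat.
public class PartitionPathSketch {
  public static void main(String[] args) {
    long unixSeconds = 1_643_155_200L; // 2022-01-26T00:00:00Z, for example
    String partitionPath = DateTimeFormatter.ofPattern("yyyy/MM/dd")
        .withZone(ZoneOffset.UTC)
        .format(Instant.ofEpochSecond(unixSeconds));
    System.out.println(partitionPath); // 2022/01/26
  }
}
```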
+# + +hoodie.insert.shuffle.parallelism=100 +hoodie.upsert.shuffle.parallelism=100 +hoodie.bulkinsert.shuffle.parallelism=100 + +hoodie.cleaner.commits.retained=5 +hoodie.keep.min.commits=9 +hoodie.keep.max.commits=10 + +hoodie.metadata.enable=true + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.embed.timeline.server=false +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test-metadata.properties b/docker/demo/config/test-suite/test-metadata.properties new file mode 100644 index 0000000000000..48da77c511e93 --- /dev/null +++ b/docker/demo/config/test-suite/test-metadata.properties @@ -0,0 +1,56 @@ + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
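The hoodie.datasource.hive_sync.* block that recurs in these files registers (or updates) table1 in the testdb Hive database over JDBC after each write. The same keys can be passed as datasource write options; a sketch, with the dataframe and base path as placeholders as in the earlier examples:

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;

// Sketch of the hive-sync settings above applied as datasource options.
public class HiveSyncSketch {
  static void write(Dataset<Row> df) {
    df.write().format("hudi")
        .option("hoodie.datasource.hive_sync.enable", "true")
        .option("hoodie.datasource.hive_sync.jdbcurl", "jdbc:hive2://hiveserver:10000/")
        .option("hoodie.datasource.hive_sync.database", "testdb")
        .option("hoodie.datasource.hive_sync.table", "table1")
        .option("hoodie.datasource.hive_sync.partition_fields", "_hoodie_partition_path")
        .option("hoodie.datasource.hive_sync.partition_extractor_class",
            "org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor")
        .mode(SaveMode.Append)
        .save("/tmp/hudi-test-table"); // placeholder base path
  }
}
```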
+# + +hoodie.insert.shuffle.parallelism=100 +hoodie.upsert.shuffle.parallelism=100 +hoodie.bulkinsert.shuffle.parallelism=100 + +hoodie.metadata.enable=true + +hoodie.deltastreamer.source.test.num_partitions=100 +hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false +hoodie.deltastreamer.source.test.max_unique_records=100000000 +hoodie.embed.timeline.server=false +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector + +hoodie.deltastreamer.source.input.selector=org.apache.hudi.integ.testsuite.helpers.DFSTestSuitePathSelector +hoodie.datasource.hive_sync.skip_ro_suffix=true + +hoodie.datasource.write.recordkey.field=_row_key +hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator +hoodie.datasource.write.partitionpath.field=timestamp + +hoodie.clustering.plan.strategy.sort.columns=_row_key +hoodie.clustering.plan.strategy.daybased.lookback.partitions=0 +hoodie.clustering.inline.max.commits=1 + +hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input +hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc +hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP +hoodie.deltastreamer.keygen.timebased.output.dateformat=yyyy/MM/dd + +hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/ +hoodie.datasource.hive_sync.database=testdb +hoodie.datasource.hive_sync.table=table1 +hoodie.datasource.hive_sync.assume_date_partitioning=false +hoodie.datasource.hive_sync.partition_fields=_hoodie_partition_path +hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor + diff --git a/docker/demo/config/test-suite/test.properties b/docker/demo/config/test-suite/test.properties index 30cd1c1f02f09..509b9f4ba628e 100644 --- a/docker/demo/config/test-suite/test.properties +++ b/docker/demo/config/test-suite/test.properties @@ -19,6 +19,8 @@ hoodie.insert.shuffle.parallelism=100 hoodie.upsert.shuffle.parallelism=100 hoodie.bulkinsert.shuffle.parallelism=100 +hoodie.metadata.enable=false + hoodie.deltastreamer.source.test.num_partitions=100 hoodie.deltastreamer.source.test.datagen.use_rocksdb_for_storing_existing_keys=false hoodie.deltastreamer.source.test.max_unique_records=100000000 @@ -32,10 +34,6 @@ hoodie.datasource.write.recordkey.field=_row_key hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.TimestampBasedKeyGenerator hoodie.datasource.write.partitionpath.field=timestamp -hoodie.clustering.plan.strategy.sort.columns=_row_key -hoodie.clustering.plan.strategy.daybased.lookback.partitions=0 -hoodie.clustering.inline.max.commits=1 - hoodie.deltastreamer.source.dfs.root=/user/hive/warehouse/hudi-integ-test-suite/input hoodie.deltastreamer.schemaprovider.target.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc hoodie.deltastreamer.schemaprovider.source.schema.file=file:/var/hoodie/ws/docker/demo/config/test-suite/source.avsc diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ArchivedCommitsCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ArchivedCommitsCommand.java index 102fcc2ae7a63..1747a59f4f366 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ArchivedCommitsCommand.java +++ 
b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ArchivedCommitsCommand.java @@ -30,6 +30,7 @@ import org.apache.hudi.common.table.log.HoodieLogFormat.Reader; import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.common.util.Option; import org.apache.avro.generic.GenericRecord; @@ -80,8 +81,7 @@ public String showArchivedCommits( // read the avro blocks while (reader.hasNext()) { HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next(); - List<IndexedRecord> records = blk.getRecords(); - readRecords.addAll(records); + blk.getRecordItr().forEachRemaining(readRecords::add); } List readCommits = readRecords.stream().map(r -> (GenericRecord) r) .filter(r -> r.get("actionType").toString().equals(HoodieTimeline.COMMIT_ACTION) @@ -155,8 +155,9 @@ public String showCommits( // read the avro blocks while (reader.hasNext()) { HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next(); - List<IndexedRecord> records = blk.getRecords(); - readRecords.addAll(records); + try (ClosableIterator<IndexedRecord> recordItr = blk.getRecordItr()) { + recordItr.forEachRemaining(readRecords::add); + } } List readCommits = readRecords.stream().map(r -> (GenericRecord) r) .map(r -> readCommit(r, skipMetadata)).collect(Collectors.toList()); diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ClusteringCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ClusteringCommand.java index 9adae1daa5336..4163f0cb5a6a4 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ClusteringCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ClusteringCommand.java @@ -116,4 +116,40 @@ public String runClustering( } return "Succeeded to run clustering for " + clusteringInstantTime; } + + /** + * Run clustering table service. + * <p> + * Example: + * > connect --path {path to hudi table} + * > clustering scheduleAndExecute --sparkMaster local --sparkMemory 2g + */ + @CliCommand(value = "clustering scheduleAndExecute", help = "Run Clustering. Make a cluster plan first and execute that plan immediately") + public String runClustering( + @CliOption(key = "sparkMaster", unspecifiedDefaultValue = SparkUtil.DEFAULT_SPARK_MASTER, help = "Spark master") final String master, + @CliOption(key = "sparkMemory", help = "Spark executor memory", unspecifiedDefaultValue = "4g") final String sparkMemory, + @CliOption(key = "parallelism", help = "Parallelism for hoodie clustering", unspecifiedDefaultValue = "1") final String parallelism, + @CliOption(key = "retry", help = "Number of retries", unspecifiedDefaultValue = "1") final String retry, + @CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for " + + "hoodie client for clustering", unspecifiedDefaultValue = "") final String propsFilePath, + @CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be " + + "passed here in the form of an array", unspecifiedDefaultValue = "") final String[] configs) throws Exception { + HoodieTableMetaClient client = HoodieCLI.getTableMetaClient(); + boolean initialized = HoodieCLI.initConf(); + HoodieCLI.initFS(initialized); + + String sparkPropertiesPath = + Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala()); + SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); + sparkLauncher.addAppArgs(SparkCommand.CLUSTERING_SCHEDULE_AND_EXECUTE.toString(), master, sparkMemory, + client.getBasePath(), client.getTableConfig().getTableName(), parallelism, retry, propsFilePath); + UtilHelpers.validateAndAddProperties(configs, sparkLauncher); + Process process = sparkLauncher.launch(); + InputStreamConsumer.captureOutput(process); + int exitCode = process.waitFor(); + if (exitCode != 0) { + return "Failed to run clustering for scheduleAndExecute."; + } + return "Succeeded to run clustering for scheduleAndExecute"; + } } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CommitsCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CommitsCommand.java index 9517234a0bb60..db1cd207df5a1 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CommitsCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CommitsCommand.java @@ -232,7 +232,9 @@ public String rollbackCommit(@CliOption(key = {"commit"}, help = "Commit to roll @CliOption(key = {"sparkProperties"}, help = "Spark Properties File Path") final String sparkPropertiesPath, @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master, @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G", - help = "Spark executor memory") final String sparkMemory, + @CliOption(key = "rollbackUsingMarkers", unspecifiedDefaultValue = "true", + help = "Enabling marker based rollback") final String rollbackUsingMarkers) throws Exception { HoodieActiveTimeline activeTimeline = HoodieCLI.getTableMetaClient().getActiveTimeline(); HoodieTimeline completedTimeline = activeTimeline.getCommitsTimeline().filterCompletedInstants(); @@ -243,7 +245,7 @@ public String rollbackCommit(@CliOption(key = {"commit"}, help = "Commit to roll SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(), master, sparkMemory, instantTime, - HoodieCLI.getTableMetaClient().getBasePath()); + HoodieCLI.getTableMetaClient().getBasePath(), rollbackUsingMarkers); Process process = sparkLauncher.launch(); InputStreamConsumer.captureOutput(process); int exitCode = process.waitFor(); diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CompactionCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CompactionCommand.java index b6a366bbb75ef..097c68a542c47 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CompactionCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/CompactionCommand.java @@ -264,6 +264,41 @@ public String compact( return "Compaction successfully completed for " + compactionInstantTime; } + @CliCommand(value = "compaction scheduleAndExecute", help = "Schedule compaction plan and execute this plan") + public String compact( + @CliOption(key = {"parallelism"}, mandatory = true, + help = "Parallelism for hoodie compaction") final String parallelism, + @CliOption(key = "schemaFilePath", mandatory = true, + help = "Path for Avro schema file") final String schemaFilePath, + @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "local", + help = "Spark Master") String master, + @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G", + help = "Spark executor memory") final String sparkMemory, + @CliOption(key = "retry", unspecifiedDefaultValue = "1", help = "Number of retries") final String retry, + @CliOption(key = "propsFilePath", help = "path to properties file on localfs or dfs with configurations for hoodie client for compacting", + unspecifiedDefaultValue = "") final String propsFilePath, + @CliOption(key = "hoodieConfigs", help = "Any configuration that can be set in the properties file can be passed here in the form of an array", + unspecifiedDefaultValue = "") final String[] configs) + throws Exception { + HoodieTableMetaClient client = checkAndGetMetaClient(); + boolean initialized = HoodieCLI.initConf(); + HoodieCLI.initFS(initialized); + String sparkPropertiesPath = + Utils.getDefaultPropertiesFile(scala.collection.JavaConversions.propertiesAsScalaMap(System.getProperties())); + SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); + sparkLauncher.addAppArgs(SparkCommand.COMPACT_SCHEDULE_AND_EXECUTE.toString(), master, sparkMemory, client.getBasePath(), + client.getTableConfig().getTableName(), parallelism, schemaFilePath, + retry, propsFilePath); + UtilHelpers.validateAndAddProperties(configs, sparkLauncher); + Process process = sparkLauncher.launch(); + InputStreamConsumer.captureOutput(process); + int exitCode = process.waitFor(); + if (exitCode != 0) { + return "Failed to schedule and execute compaction "; + } + return "Schedule and execute compaction successfully completed"; + } + /** * Prints all compaction details. 
*/ diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ExportCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ExportCommand.java index 119ccb0dcf039..1d8d6dcd6ae93 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ExportCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/ExportCommand.java @@ -34,14 +34,16 @@ import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.exception.HoodieException; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; + import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; import org.apache.avro.specific.SpecificData; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; import org.springframework.shell.core.CommandMarker; import org.springframework.shell.core.annotation.CliCommand; import org.springframework.shell.core.annotation.CliOption; @@ -69,8 +71,8 @@ public class ExportCommand implements CommandMarker { @CliCommand(value = "export instants", help = "Export Instants and their metadata from the Timeline") public String exportInstants( @CliOption(key = {"limit"}, help = "Limit Instants", unspecifiedDefaultValue = "-1") final Integer limit, - @CliOption(key = {"actions"}, help = "Comma seperated list of Instant actions to export", - unspecifiedDefaultValue = "clean,commit,deltacommit,rollback,savepoint,restore") final String filter, + @CliOption(key = {"actions"}, help = "Comma separated list of Instant actions to export", + unspecifiedDefaultValue = "clean,commit,deltacommit,rollback,savepoint,restore") final String filter, @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, @CliOption(key = {"localFolder"}, help = "Local Folder to export to", mandatory = true) String localFolder) throws Exception { @@ -122,44 +124,46 @@ private int copyArchivedInstants(List<FileStatus> statuses, Set<String> actionSe // read the avro blocks while (reader.hasNext() && copyCount < limit) { HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next(); - for (IndexedRecord ir : blk.getRecords()) { - // Archived instants are saved as arvo encoded HoodieArchivedMetaEntry records. We need to get the - // metadata record from the entry and convert it to json. + try (ClosableIterator<IndexedRecord> recordItr = blk.getRecordItr()) { + while (recordItr.hasNext()) { + IndexedRecord ir = recordItr.next(); + // Archived instants are saved as avro encoded HoodieArchivedMetaEntry records. We need to get the + // metadata record from the entry and convert it to json.
+ HoodieArchivedMetaEntry archiveEntryRecord = (HoodieArchivedMetaEntry) SpecificData.get() + .deepCopy(HoodieArchivedMetaEntry.SCHEMA$, ir); + final String action = archiveEntryRecord.get("actionType").toString(); + if (!actionSet.contains(action)) { + continue; + } + + GenericRecord metadata = null; + switch (action) { + case HoodieTimeline.CLEAN_ACTION: + metadata = archiveEntryRecord.getHoodieCleanMetadata(); + break; + case HoodieTimeline.COMMIT_ACTION: + case HoodieTimeline.DELTA_COMMIT_ACTION: + metadata = archiveEntryRecord.getHoodieCommitMetadata(); + break; + case HoodieTimeline.ROLLBACK_ACTION: + metadata = archiveEntryRecord.getHoodieRollbackMetadata(); + break; + case HoodieTimeline.SAVEPOINT_ACTION: + metadata = archiveEntryRecord.getHoodieSavePointMetadata(); + break; + case HoodieTimeline.COMPACTION_ACTION: + metadata = archiveEntryRecord.getHoodieCompactionMetadata(); + break; + default: + throw new HoodieException("Unknown type of action " + action); + } + + final String instantTime = archiveEntryRecord.get("commitTime").toString(); + final String outPath = localFolder + Path.SEPARATOR + instantTime + "." + action; + writeToFile(outPath, HoodieAvroUtils.avroToJson(metadata, true)); + if (++copyCount == limit) { break; - case HoodieTimeline.COMMIT_ACTION: - case HoodieTimeline.DELTA_COMMIT_ACTION: - metadata = archiveEntryRecord.getHoodieCommitMetadata(); - break; - case HoodieTimeline.ROLLBACK_ACTION: - metadata = archiveEntryRecord.getHoodieRollbackMetadata(); - break; - case HoodieTimeline.SAVEPOINT_ACTION: - metadata = archiveEntryRecord.getHoodieSavePointMetadata(); - break; - case HoodieTimeline.COMPACTION_ACTION: - metadata = archiveEntryRecord.getHoodieCompactionMetadata(); - break; - default: - throw new HoodieException("Unknown type of action " + action); - } - - final String instantTime = archiveEntryRecord.get("commitTime").toString(); - final String outPath = localFolder + Path.SEPARATOR + instantTime + "." 
+ action; - writeToFile(outPath, HoodieAvroUtils.avroToJson(metadata, true)); - if (++copyCount == limit) { - break; + } } } } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieLogFileCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieLogFileCommand.java index 27bcd81faefec..4a56858f3926a 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieLogFileCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieLogFileCommand.java @@ -37,6 +37,7 @@ import org.apache.hudi.common.table.log.block.HoodieLogBlock; import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType; import org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType; +import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieMemoryConfig; @@ -60,6 +61,7 @@ import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; import scala.Tuple2; @@ -100,7 +102,7 @@ public String showLogFileCommits( while (reader.hasNext()) { HoodieLogBlock n = reader.next(); String instantTime; - int recordCount = 0; + AtomicInteger recordCount = new AtomicInteger(0); if (n instanceof HoodieCorruptBlock) { try { instantTime = n.getLogBlockHeader().get(HeaderMetadataType.INSTANT_TIME); @@ -120,17 +122,19 @@ instantTime = "dummy_instant_time_" + dummyInstantTimeCount; } if (n instanceof HoodieDataBlock) { - recordCount = ((HoodieDataBlock) n).getRecords().size(); + try (ClosableIterator<IndexedRecord> recordItr = ((HoodieDataBlock) n).getRecordItr()) { + recordItr.forEachRemaining(r -> recordCount.incrementAndGet()); + } } } if (commitCountAndMetadata.containsKey(instantTime)) { commitCountAndMetadata.get(instantTime).add( - new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount)); + new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount.get())); } else { List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>> list = new ArrayList<>(); list.add( - new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount)); + new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount.get())); commitCountAndMetadata.put(instantTime, list); } } @@ -232,11 +236,12 @@ public String showLogFileRecords( HoodieLogBlock n = reader.next(); if (n instanceof HoodieDataBlock) { HoodieDataBlock blk = (HoodieDataBlock) n; - List<IndexedRecord> records = blk.getRecords(); - for (IndexedRecord record : records) { - if (allRecords.size() < limit) { - allRecords.add(record); - } + try (ClosableIterator<IndexedRecord> recordItr = blk.getRecordItr()) { + recordItr.forEachRemaining(record -> { + if (allRecords.size() < limit) { + allRecords.add(record); + } + }); } } }
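Both commands above now consume data-block records through the same streaming idiom instead of materializing the full record list in memory; a minimal sketch of the pattern (assuming a HoodieDataBlock named dataBlock is in scope):

    try (ClosableIterator<IndexedRecord> recordItr = dataBlock.getRecordItr()) {
      while (recordItr.hasNext()) {
        IndexedRecord record = recordItr.next();
        // process one record at a time; close() releases the underlying block reader
      }
    }

The try-with-resources block is what guarantees the iterator is closed even when the caller stops early, for example once a record limit is reached.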
diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/MarkersCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/MarkersCommand.java new file mode 100644 index 0000000000000..57a4ee1879855 --- /dev/null +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/MarkersCommand.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.cli.commands; + +import org.apache.hudi.cli.HoodieCLI; +import org.apache.hudi.cli.utils.InputStreamConsumer; +import org.apache.hudi.cli.utils.SparkUtil; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.spark.launcher.SparkLauncher; +import org.springframework.shell.core.CommandMarker; +import org.springframework.shell.core.annotation.CliCommand; +import org.springframework.shell.core.annotation.CliOption; +import org.springframework.stereotype.Component; + +/** + * CLI command for marker options. + */ +@Component +public class MarkersCommand implements CommandMarker { + + @CliCommand(value = "marker delete", help = "Delete the marker files of a commit") + public String deleteMarker(@CliOption(key = {"commit"}, help = "Instant time of the commit whose markers should be deleted") final String instantTime, + @CliOption(key = {"sparkProperties"}, help = "Spark Properties File Path") final String sparkPropertiesPath, + @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master, + @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "1G", + help = "Spark executor memory") final String sparkMemory) + throws Exception { + HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient(); + SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath); + sparkLauncher.addAppArgs(SparkMain.SparkCommand.DELETE_MARKER.toString(), master, sparkMemory, instantTime, + metaClient.getBasePath()); + Process process = sparkLauncher.launch(); + InputStreamConsumer.captureOutput(process); + int exitCode = process.waitFor(); + // Refresh the current table metadata + HoodieCLI.refreshTableMetadata(); + if (exitCode != 0) { + return String.format("Failed: Could not delete marker \"%s\".", instantTime); + } + return String.format("Marker \"%s\" deleted.", instantTime); + } +} diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RepairsCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RepairsCommand.java index 2533562d8206e..6c068c898b9be 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RepairsCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/RepairsCommand.java @@ -45,7 +45,6 @@ import org.springframework.stereotype.Component; import scala.collection.JavaConverters; -import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.util.List; @@ -153,10 +152,12 @@ public String overwriteHoodieProperties( HoodieTableMetaClient client = HoodieCLI.getTableMetaClient(); Properties newProps = new Properties(); - newProps.load(new FileInputStream(new File(overwriteFilePath))); + newProps.load(new FileInputStream(overwriteFilePath)); Map oldProps = client.getTableConfig().propsMap(); Path metaPathDir = new Path(client.getBasePath(), METAFOLDER_NAME); HoodieTableConfig.create(client.getFs(), metaPathDir, newProps); + // reload new props as checksum would have been added + newProps =
HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient()).getTableConfig().getProps(); TreeSet allPropKeys = new TreeSet<>(); allPropKeys.addAll(newProps.keySet().stream().map(Object::toString).collect(Collectors.toSet())); diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java index d1ee109f59042..0de1a1adfe0be 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java @@ -18,6 +18,7 @@ package org.apache.hudi.cli.commands; +import org.apache.hadoop.fs.Path; import org.apache.hudi.DataSourceWriteOptions; import org.apache.hudi.cli.DeDupeType; import org.apache.hudi.cli.DedupeSparkJob; @@ -25,6 +26,7 @@ import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.HoodieTableVersion; @@ -38,7 +40,9 @@ import org.apache.hudi.exception.HoodieSavepointException; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.keygen.constant.KeyGeneratorType; +import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.action.compact.strategy.UnBoundedCompactionStrategy; +import org.apache.hudi.table.marker.WriteMarkersFactory; import org.apache.hudi.table.upgrade.SparkUpgradeDowngradeHelper; import org.apache.hudi.table.upgrade.UpgradeDowngrade; import org.apache.hudi.utilities.HDFSParquetImporter; @@ -51,8 +55,6 @@ import org.apache.hudi.utilities.UtilHelpers; import org.apache.hudi.utilities.deltastreamer.BootstrapExecutor; import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer; - -import org.apache.hadoop.fs.Path; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SQLContext; @@ -74,9 +76,9 @@ public class SparkMain { * Commands. 
*/ enum SparkCommand { - BOOTSTRAP, ROLLBACK, DEDUPLICATE, ROLLBACK_TO_SAVEPOINT, SAVEPOINT, IMPORT, UPSERT, COMPACT_SCHEDULE, COMPACT_RUN, + BOOTSTRAP, ROLLBACK, DEDUPLICATE, ROLLBACK_TO_SAVEPOINT, SAVEPOINT, IMPORT, UPSERT, COMPACT_SCHEDULE, COMPACT_RUN, COMPACT_SCHEDULE_AND_EXECUTE, COMPACT_UNSCHEDULE_PLAN, COMPACT_UNSCHEDULE_FILE, COMPACT_VALIDATE, COMPACT_REPAIR, CLUSTERING_SCHEDULE, - CLUSTERING_RUN, CLEAN, DELETE_SAVEPOINT, UPGRADE, DOWNGRADE + CLUSTERING_RUN, CLUSTERING_SCHEDULE_AND_EXECUTE, CLEAN, DELETE_MARKER, DELETE_SAVEPOINT, UPGRADE, DOWNGRADE } public static void main(String[] args) throws Exception { @@ -92,8 +94,8 @@ public static void main(String[] args) throws Exception { try { switch (cmd) { case ROLLBACK: - assert (args.length == 5); - returnCode = rollback(jsc, args[3], args[4]); + assert (args.length == 6); + returnCode = rollback(jsc, args[3], args[4], Boolean.parseBoolean(args[5])); break; case DEDUPLICATE: assert (args.length == 8); @@ -128,7 +130,21 @@ configs.addAll(Arrays.asList(args).subList(9, args.length)); } returnCode = compact(jsc, args[3], args[4], args[5], Integer.parseInt(args[6]), args[7], - Integer.parseInt(args[8]), false, propsFilePath, configs); + Integer.parseInt(args[8]), HoodieCompactor.EXECUTE, propsFilePath, configs); + break; + case COMPACT_SCHEDULE_AND_EXECUTE: + assert (args.length >= 9); + propsFilePath = null; + if (!StringUtils.isNullOrEmpty(args[8])) { + propsFilePath = args[8]; + } + configs = new ArrayList<>(); + if (args.length > 9) { + configs.addAll(Arrays.asList(args).subList(9, args.length)); + } + + returnCode = compact(jsc, args[3], args[4], null, Integer.parseInt(args[5]), args[6], + Integer.parseInt(args[7]), HoodieCompactor.SCHEDULE_AND_EXECUTE, propsFilePath, configs); break; case COMPACT_SCHEDULE: assert (args.length >= 7); @@ -140,7 +156,7 @@ if (args.length > 7) { configs.addAll(Arrays.asList(args).subList(7, args.length)); } - returnCode = compact(jsc, args[3], args[4], args[5], 1, "", 0, true, propsFilePath, configs); + returnCode = compact(jsc, args[3], args[4], args[5], 1, "", 0, HoodieCompactor.SCHEDULE, propsFilePath, configs); break; case COMPACT_VALIDATE: assert (args.length == 7); @@ -176,7 +192,20 @@ configs.addAll(Arrays.asList(args).subList(9, args.length)); } returnCode = cluster(jsc, args[3], args[4], args[5], Integer.parseInt(args[6]), args[2], - Integer.parseInt(args[7]), false, propsFilePath, configs); + Integer.parseInt(args[7]), HoodieClusteringJob.EXECUTE, propsFilePath, configs); + break; + case CLUSTERING_SCHEDULE_AND_EXECUTE: + assert (args.length >= 8); + propsFilePath = null; + if (!StringUtils.isNullOrEmpty(args[7])) { + propsFilePath = args[7]; + } + configs = new ArrayList<>(); + if (args.length > 8) { + configs.addAll(Arrays.asList(args).subList(8, args.length)); + } + returnCode = cluster(jsc, args[3], args[4], null, Integer.parseInt(args[5]), args[2], + Integer.parseInt(args[6]), HoodieClusteringJob.SCHEDULE_AND_EXECUTE, propsFilePath, configs); break; case CLUSTERING_SCHEDULE: assert (args.length >= 7); @@ -189,7 +218,7 @@ configs.addAll(Arrays.asList(args).subList(7, args.length)); } returnCode = cluster(jsc, args[3], args[4], args[5], 1, args[2], - 0, true, propsFilePath, configs); + 0, HoodieClusteringJob.SCHEDULE, propsFilePath, configs); break; case CLEAN:
assert (args.length >= 5); @@ -207,6 +236,10 @@ public static void main(String[] args) throws Exception { assert (args.length == 7); returnCode = createSavepoint(jsc, args[3], args[4], args[5], args[6]); break; + case DELETE_MARKER: + assert (args.length == 5); + returnCode = deleteMarker(jsc, args[3], args[4]); + break; case DELETE_SAVEPOINT: assert (args.length == 5); returnCode = deleteSavepoint(jsc, args[3], args[4]); @@ -250,6 +283,21 @@ protected static void clean(JavaSparkContext jsc, String basePath, String propsF new HoodieCleaner(cfg, jsc).run(); } + protected static int deleteMarker(JavaSparkContext jsc, String instantTime, String basePath) { + try { + SparkRDDWriteClient client = createHoodieClient(jsc, basePath); + HoodieWriteConfig config = client.getConfig(); + HoodieEngineContext context = client.getEngineContext(); + HoodieSparkTable table = HoodieSparkTable.create(config, context, true); + WriteMarkersFactory.get(config.getMarkersType(), table, instantTime) + .quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism()); + return 0; + } catch (Exception e) { + LOG.warn(String.format("Failed: Could not clean marker instantTime: \"%s\".", instantTime), e); + return -1; + } + } + private static int dataLoad(JavaSparkContext jsc, String command, String srcPath, String targetPath, String tableName, String tableType, String rowKey, String partitionKey, int parallelism, String schemaFile, int retry, String propsFilePath, List configs) { @@ -320,7 +368,7 @@ private static void doCompactUnscheduleFile(JavaSparkContext jsc, String basePat } private static int compact(JavaSparkContext jsc, String basePath, String tableName, String compactionInstant, - int parallelism, String schemaFile, int retry, boolean schedule, String propsFilePath, + int parallelism, String schemaFile, int retry, String mode, String propsFilePath, List configs) { HoodieCompactor.Config cfg = new HoodieCompactor.Config(); cfg.basePath = basePath; @@ -330,20 +378,20 @@ private static int compact(JavaSparkContext jsc, String basePath, String tableNa cfg.strategyClassName = UnBoundedCompactionStrategy.class.getCanonicalName(); cfg.parallelism = parallelism; cfg.schemaFile = schemaFile; - cfg.runSchedule = schedule; + cfg.runningMode = mode; cfg.propsFilePath = propsFilePath; cfg.configs = configs; return new HoodieCompactor(jsc, cfg).compact(retry); } private static int cluster(JavaSparkContext jsc, String basePath, String tableName, String clusteringInstant, - int parallelism, String sparkMemory, int retry, boolean schedule, String propsFilePath, List configs) { + int parallelism, String sparkMemory, int retry, String runningMode, String propsFilePath, List configs) { HoodieClusteringJob.Config cfg = new HoodieClusteringJob.Config(); cfg.basePath = basePath; cfg.tableName = tableName; cfg.clusteringInstantTime = clusteringInstant; cfg.parallelism = parallelism; - cfg.runSchedule = schedule; + cfg.runningMode = runningMode; cfg.propsFilePath = propsFilePath; cfg.configs = configs; jsc.getConf().set("spark.executor.memory", sparkMemory); @@ -394,8 +442,8 @@ private static int doBootstrap(JavaSparkContext jsc, String tableName, String ta return 0; } - private static int rollback(JavaSparkContext jsc, String instantTime, String basePath) throws Exception { - SparkRDDWriteClient client = createHoodieClient(jsc, basePath); + private static int rollback(JavaSparkContext jsc, String instantTime, String basePath, Boolean rollbackUsingMarkers) throws Exception { + SparkRDDWriteClient client = createHoodieClient(jsc, 
basePath, rollbackUsingMarkers); if (client.rollback(instantTime)) { LOG.info(String.format("The commit \"%s\" rolled back.", instantTime)); return 0; @@ -425,7 +473,7 @@ private static int rollbackToSavepoint(JavaSparkContext jsc, String savepointTim LOG.info(String.format("The commit \"%s\" rolled back.", savepointTime)); return 0; } catch (Exception e) { - LOG.warn(String.format("The commit \"%s\" failed to roll back.", savepointTime)); + LOG.warn(String.format("The commit \"%s\" failed to roll back.", savepointTime), e); return -1; } } @@ -437,7 +485,7 @@ private static int deleteSavepoint(JavaSparkContext jsc, String savepointTime, S LOG.info(String.format("Savepoint \"%s\" deleted.", savepointTime)); return 0; } catch (Exception e) { - LOG.warn(String.format("Failed: Could not delete savepoint \"%s\".", savepointTime)); + LOG.warn(String.format("Failed: Could not delete savepoint \"%s\".", savepointTime), e); return -1; } } @@ -452,11 +500,12 @@ private static int deleteSavepoint(JavaSparkContext jsc, String savepointTime, S * @throws Exception */ protected static int upgradeOrDowngradeTable(JavaSparkContext jsc, String basePath, String toVersion) { - HoodieWriteConfig config = getWriteConfig(basePath); + HoodieWriteConfig config = getWriteConfig(basePath, Boolean.parseBoolean(HoodieWriteConfig.ROLLBACK_USING_MARKERS_ENABLE.defaultValue())); HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(jsc.hadoopConfiguration()).setBasePath(config.getBasePath()) .setLoadActiveTimelineOnLoad(false).setConsistencyGuardConfig(config.getConsistencyGuardConfig()) - .setLayoutVersion(Option.of(new TimelineLayoutVersion(config.getTimelineLayoutVersion()))).build(); + .setLayoutVersion(Option.of(new TimelineLayoutVersion(config.getTimelineLayoutVersion()))) + .setFileSystemRetryConfig(config.getFileSystemRetryConfig()).build(); try { new UpgradeDowngrade(metaClient, config, new HoodieSparkEngineContext(jsc), SparkUpgradeDowngradeHelper.getInstance()) .run(HoodieTableVersion.valueOf(toVersion), null); @@ -468,13 +517,18 @@ protected static int upgradeOrDowngradeTable(JavaSparkContext jsc, String basePa } } - private static SparkRDDWriteClient createHoodieClient(JavaSparkContext jsc, String basePath) throws Exception { - HoodieWriteConfig config = getWriteConfig(basePath); + private static SparkRDDWriteClient createHoodieClient(JavaSparkContext jsc, String basePath, Boolean rollbackUsingMarkers) throws Exception { + HoodieWriteConfig config = getWriteConfig(basePath, rollbackUsingMarkers); return new SparkRDDWriteClient(new HoodieSparkEngineContext(jsc), config); } - private static HoodieWriteConfig getWriteConfig(String basePath) { + private static SparkRDDWriteClient createHoodieClient(JavaSparkContext jsc, String basePath) throws Exception { + return createHoodieClient(jsc, basePath, Boolean.parseBoolean(HoodieWriteConfig.ROLLBACK_USING_MARKERS_ENABLE.defaultValue())); + } + + private static HoodieWriteConfig getWriteConfig(String basePath, Boolean rollbackUsingMarkers) { return HoodieWriteConfig.newBuilder().withPath(basePath) + .withRollbackUsingMarkers(rollbackUsingMarkers) .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build(); } } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/utils/CommitUtil.java b/hudi-cli/src/main/java/org/apache/hudi/cli/utils/CommitUtil.java index a95cc53df329c..5d58aa9d2e498 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/utils/CommitUtil.java +++ 
b/hudi-cli/src/main/java/org/apache/hudi/cli/utils/CommitUtil.java @@ -25,9 +25,6 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline; import java.io.IOException; -import java.text.ParseException; -import java.time.Instant; -import java.time.ZoneId; import java.time.ZonedDateTime; import java.util.Date; import java.util.List; @@ -53,16 +50,4 @@ public static String getTimeDaysAgo(int numberOfDays) { Date date = Date.from(ZonedDateTime.now().minusDays(numberOfDays).toInstant()); return HoodieActiveTimeline.formatDate(date); } - - /** - * Add hours to specified time. If hours <0, this acts as remove hours. - * example, say compactionCommitTime: "20200202020000" - * a) hours: +1, returns 20200202030000 - * b) hours: -1, returns 20200202010000 - */ - public static String addHours(String compactionCommitTime, int hours) throws ParseException { - Instant instant = HoodieActiveTimeline.parseDateFromInstantTime(compactionCommitTime).toInstant(); - ZonedDateTime commitDateTime = ZonedDateTime.ofInstant(instant, ZoneId.systemDefault()); - return HoodieActiveTimeline.formatDate(Date.from(commitDateTime.plusHours(hours).toInstant())); - } } diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/utils/SparkTempViewProvider.java b/hudi-cli/src/main/java/org/apache/hudi/cli/utils/SparkTempViewProvider.java index bbd8440448fd6..6f5a11ad6657f 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/utils/SparkTempViewProvider.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/utils/SparkTempViewProvider.java @@ -20,8 +20,6 @@ import org.apache.hudi.exception.HoodieException; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Dataset; @@ -31,27 +29,34 @@ import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.StructType; +import org.springframework.shell.support.logging.HandlerUtils; import java.util.List; +import java.util.logging.Handler; +import java.util.logging.Level; +import java.util.logging.Logger; import java.util.stream.Collectors; public class SparkTempViewProvider implements TempViewProvider { - private static final Logger LOG = LogManager.getLogger(SparkTempViewProvider.class); + private static final Logger LOG = HandlerUtils.getLogger(SparkTempViewProvider.class); private JavaSparkContext jsc; private SQLContext sqlContext; public SparkTempViewProvider(String appName) { try { + Handler handler = LOG.getParent().getHandlers()[0]; SparkConf sparkConf = new SparkConf().setAppName(appName) .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer").setMaster("local[8]"); jsc = new JavaSparkContext(sparkConf); - jsc.setLogLevel("ERROR"); - sqlContext = new SQLContext(jsc); + if (handler != null) { + LOG.getParent().removeHandler(LOG.getParent().getHandlers()[0]); + LOG.getParent().addHandler(handler); + } } catch (Throwable ex) { // log full stack trace and rethrow. 
Without this its difficult to debug failures, if any - LOG.error("unable to initialize spark context ", ex); + LOG.log(Level.WARNING, "unable to initialize spark context ", ex); throw new HoodieException(ex); } } @@ -90,7 +95,7 @@ public void createOrReplace(String tableName, List headers, List entry : data.entrySet()) { String key = entry.getKey(); Integer[] value = entry.getValue(); @@ -279,8 +295,8 @@ public void testShowArchivedCommitsWithMultiCommitsFile() throws Exception { HoodieSparkTable table = HoodieSparkTable.create(cfg, context(), metaClient); // need to create multi archive files - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); - archiveLog.archiveIfRequired(context()); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(cfg, table); + archiver.archiveIfRequired(context()); } CommandResult cr = shell().executeCommand(String.format("commits showarchived --startTs %s --endTs %s", "160", "174")); diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCompactionCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCompactionCommand.java index 21841a5769450..17c1002f6b0dd 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCompactionCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCompactionCommand.java @@ -24,7 +24,9 @@ import org.apache.hudi.cli.TableHeader; import org.apache.hudi.cli.functional.CLIFunctionalTestHarness; import org.apache.hudi.cli.testutils.HoodieTestCommitMetadataGenerator; -import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.fs.HoodieWrapperFileSystem; +import org.apache.hudi.common.fs.NoOpConsistencyGuard; import org.apache.hudi.common.model.HoodieAvroPayload; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.HoodieTableMetaClient; @@ -35,12 +37,13 @@ import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; import org.apache.hudi.common.testutils.CompactionTestUtils; +import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.table.HoodieSparkTable; -import org.apache.hudi.table.HoodieTimelineArchiveLog; +import org.apache.hudi.client.HoodieTimelineArchiver; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Tag; @@ -152,7 +155,11 @@ private void generateCompactionInstances() throws IOException { activeTimeline.transitionCompactionInflightToComplete( new HoodieInstant(HoodieInstant.State.INFLIGHT, COMPACTION_ACTION, timestamp), Option.empty()); }); - + // Simulate a compaction commit in metadata table timeline + // so the archival in data table can happen + HoodieTestUtils.createCompactionCommitInMetadataTable(hadoopConf(), + new HoodieWrapperFileSystem( + FSUtils.getFs(tablePath, hadoopConf()), new NoOpConsistencyGuard()), tablePath, "007"); } private void generateArchive() throws IOException { @@ -162,13 +169,12 @@ private void generateArchive() throws IOException { .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 3).build()) .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() .withRemoteServerPort(timelineServicePort).build()) - 
.withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()) .forTable("test-trip-table").build(); // archive HoodieTableMetaClient metaClient = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient()); HoodieSparkTable table = HoodieSparkTable.create(cfg, context(), metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); - archiveLog.archiveIfRequired(context()); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(cfg, table); + archiver.archiveIfRequired(context()); } /** diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java index f2571ce3598d6..ee7fbda11b783 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestHoodieLogFileCommand.java @@ -108,7 +108,7 @@ public void init() throws IOException, InterruptedException, URISyntaxException Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, INSTANT_TIME); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - dataBlock = new HoodieAvroDataBlock(records, header); + dataBlock = new HoodieAvroDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD); writer.appendBlock(dataBlock); } } @@ -188,7 +188,7 @@ public void testShowLogFileRecordsWithMerge() throws IOException, InterruptedExc Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, INSTANT_TIME); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header); + HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header, HoodieRecord.RECORD_KEY_METADATA_FIELD); writer.appendBlock(dataBlock); } finally { if (writer != null) { diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestRepairsCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestRepairsCommand.java index 048b2a20e6b2c..27cc31ccea2cf 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestRepairsCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestRepairsCommand.java @@ -39,7 +39,6 @@ import org.junit.jupiter.api.Test; import org.springframework.shell.core.CommandResult; -import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.net.URL; @@ -51,6 +50,14 @@ import java.util.Properties; import java.util.stream.Collectors; +import static org.apache.hudi.common.table.HoodieTableConfig.ARCHIVELOG_FOLDER; +import static org.apache.hudi.common.table.HoodieTableConfig.NAME; +import static org.apache.hudi.common.table.HoodieTableConfig.TABLE_CHECKSUM; +import static org.apache.hudi.common.table.HoodieTableConfig.TIMELINE_LAYOUT_VERSION; +import static org.apache.hudi.common.table.HoodieTableConfig.TYPE; +import static org.apache.hudi.common.table.HoodieTableConfig.VERSION; +import static org.apache.hudi.common.table.HoodieTableConfig.generateChecksum; +import static org.apache.hudi.common.table.HoodieTableConfig.validateChecksum; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -104,7 +111,7 @@ public void testAddPartitionMetaWithDryRun() throws IOException { // expected all 'No'. 
String[][] rows = FSUtils.getAllPartitionFoldersThreeLevelsDown(fs, tablePath) .stream() - .map(partition -> new String[]{partition, "No", "None"}) + .map(partition -> new String[] {partition, "No", "None"}) .toArray(String[][]::new); String expected = HoodiePrintHelper.print(new String[] {HoodieTableHeaderFields.HEADER_PARTITION_PATH, HoodieTableHeaderFields.HEADER_METADATA_PRESENT, HoodieTableHeaderFields.HEADER_ACTION}, rows); @@ -135,7 +142,7 @@ public void testAddPartitionMetaWithRealRun() throws IOException { List paths = FSUtils.getAllPartitionFoldersThreeLevelsDown(fs, tablePath); // after dry run, the action will be 'Repaired' String[][] rows = paths.stream() - .map(partition -> new String[]{partition, "No", "Repaired"}) + .map(partition -> new String[] {partition, "No", "Repaired"}) .toArray(String[][]::new); String expected = HoodiePrintHelper.print(new String[] {HoodieTableHeaderFields.HEADER_PARTITION_PATH, HoodieTableHeaderFields.HEADER_METADATA_PRESENT, HoodieTableHeaderFields.HEADER_ACTION}, rows); @@ -147,7 +154,7 @@ public void testAddPartitionMetaWithRealRun() throws IOException { // after real run, Metadata is present now. rows = paths.stream() - .map(partition -> new String[]{partition, "Yes", "None"}) + .map(partition -> new String[] {partition, "Yes", "None"}) .toArray(String[][]::new); expected = HoodiePrintHelper.print(new String[] {HoodieTableHeaderFields.HEADER_PARTITION_PATH, HoodieTableHeaderFields.HEADER_METADATA_PRESENT, HoodieTableHeaderFields.HEADER_ACTION}, rows); @@ -170,19 +177,24 @@ public void testOverwriteHoodieProperties() throws IOException { Map oldProps = HoodieCLI.getTableMetaClient().getTableConfig().propsMap(); // after overwrite, the stored value in .hoodie is equals to which read from properties. - Map result = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient()).getTableConfig().propsMap(); + HoodieTableConfig tableConfig = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient()).getTableConfig(); + Map result = tableConfig.propsMap(); + // validate table checksum + assertTrue(result.containsKey(TABLE_CHECKSUM.key())); + assertTrue(validateChecksum(tableConfig.getProps())); Properties expectProps = new Properties(); - expectProps.load(new FileInputStream(new File(newProps.getPath()))); + expectProps.load(new FileInputStream(newProps.getPath())); Map expected = expectProps.entrySet().stream() .collect(Collectors.toMap(e -> String.valueOf(e.getKey()), e -> String.valueOf(e.getValue()))); + expected.putIfAbsent(TABLE_CHECKSUM.key(), String.valueOf(generateChecksum(tableConfig.getProps()))); assertEquals(expected, result); // check result - List allPropsStr = Arrays.asList("hoodie.table.name", "hoodie.table.type", "hoodie.table.version", - "hoodie.archivelog.folder", "hoodie.timeline.layout.version"); - String[][] rows = allPropsStr.stream().sorted().map(key -> new String[]{key, - oldProps.getOrDefault(key, "null"), result.getOrDefault(key, "null")}) + List allPropsStr = Arrays.asList(NAME.key(), TYPE.key(), VERSION.key(), + ARCHIVELOG_FOLDER.key(), TIMELINE_LAYOUT_VERSION.key(), TABLE_CHECKSUM.key()); + String[][] rows = allPropsStr.stream().sorted().map(key -> new String[] {key, + oldProps.getOrDefault(key, "null"), result.getOrDefault(key, "null")}) .toArray(String[][]::new); String expect = HoodiePrintHelper.print(new String[] {HoodieTableHeaderFields.HEADER_HOODIE_PROPERTY, HoodieTableHeaderFields.HEADER_OLD_VALUE, HoodieTableHeaderFields.HEADER_NEW_VALUE}, rows); diff --git 
a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestRollbacksCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestRollbacksCommand.java index 17bc48f66f0c4..9a10893b35e89 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestRollbacksCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestRollbacksCommand.java @@ -24,7 +24,7 @@ import org.apache.hudi.cli.HoodieTableHeaderFields; import org.apache.hudi.cli.TableHeader; import org.apache.hudi.cli.functional.CLIFunctionalTestHarness; -import org.apache.hudi.client.AbstractHoodieWriteClient; +import org.apache.hudi.client.BaseHoodieWriteClient; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.HoodieTableMetaClient; @@ -93,7 +93,7 @@ public void init() throws Exception { .withRollbackUsingMarkers(false) .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build(); - try (AbstractHoodieWriteClient client = new SparkRDDWriteClient(context(), config)) { + try (BaseHoodieWriteClient client = new SparkRDDWriteClient(context(), config)) { // Rollback inflight commit3 and commit2 client.rollback("102"); client.rollback("101"); diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestUpgradeDowngradeCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestUpgradeDowngradeCommand.java index cba6d901b956d..b3650fa027626 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestUpgradeDowngradeCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestUpgradeDowngradeCommand.java @@ -106,7 +106,7 @@ public void testDowngradeCommand() throws Exception { assertEquals(metaClient.getTableConfig().getTableVersion().versionCode(), HoodieTableVersion.ZERO.versionCode()); assertTableVersionFromPropertyFile(); - // verify marker files are non existant + // verify marker files are non existent for (String partitionPath : DEFAULT_PARTITION_PATHS) { assertEquals(0, FileCreateUtils.getTotalMarkerFileCount(tablePath, partitionPath, "101", IOType.MERGE)); } diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestClusteringCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestClusteringCommand.java new file mode 100644 index 0000000000000..17075f9d3dfb6 --- /dev/null +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestClusteringCommand.java @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.cli.integ; + +import org.apache.hudi.cli.HoodieCLI; +import org.apache.hudi.cli.commands.TableCommand; +import org.apache.hudi.cli.testutils.AbstractShellIntegrationTest; +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.testutils.HoodieClientTestBase; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.springframework.shell.core.CommandResult; + +import java.io.IOException; +import java.nio.file.Paths; +import java.util.List; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.assertAll; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Integration test class for {@link org.apache.hudi.cli.commands.ClusteringCommand}. + *

+ * A command using SparkLauncher needs to load the jars under lib, which are generated during mvn package. + * Use an integration test instead of a unit test. + */ +public class ITTestClusteringCommand extends AbstractShellIntegrationTest { + + private String tablePath; + private String tableName; + + @BeforeEach + public void init() throws IOException { + tableName = "test_table_" + ITTestClusteringCommand.class.getName(); + tablePath = Paths.get(basePath, tableName).toString(); + + HoodieCLI.conf = jsc.hadoopConfiguration(); + // Create table and connect + new TableCommand().createTable( + tablePath, tableName, HoodieTableType.COPY_ON_WRITE.name(), + "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); + metaClient.setBasePath(tablePath); + metaClient = HoodieTableMetaClient.reload(metaClient); + } + + /** + * Test case for command 'clustering schedule'. + */ + @Test + public void testScheduleClustering() throws IOException { + // generate commits + generateCommits(); + + CommandResult cr = scheduleClustering(); + assertAll("Command run failed", + () -> assertTrue(cr.isSuccess()), + () -> assertTrue( + cr.getResult().toString().startsWith("Succeeded to schedule clustering for"))); + + // there is 1 requested clustering + HoodieActiveTimeline timeline = HoodieCLI.getTableMetaClient().getActiveTimeline(); + assertEquals(1, timeline.filterPendingReplaceTimeline().countInstants()); + } + + /** + * Test case for command 'clustering run'. + */ + @Test + public void testClustering() throws IOException { + // generate commits + generateCommits(); + + CommandResult cr1 = scheduleClustering(); + assertTrue(cr1.isSuccess()); + + // get clustering instance + HoodieActiveTimeline timeline = HoodieCLI.getTableMetaClient().getActiveTimeline(); + Option<String> instance = + timeline.filterPendingReplaceTimeline().firstInstant().map(HoodieInstant::getTimestamp); + assertTrue(instance.isPresent(), "Must have pending clustering."); + + CommandResult cr2 = getShell().executeCommand( + String.format("clustering run --parallelism %s --clusteringInstant %s --sparkMaster %s", + 2, instance.get(), "local")); + + assertAll("Command run failed", + () -> assertTrue(cr2.isSuccess()), + () -> assertTrue( + cr2.getResult().toString().startsWith("Succeeded to run clustering for "))); + + // assert clustering complete + assertTrue(HoodieCLI.getTableMetaClient().getActiveTimeline().reload() + .filterCompletedInstants().getInstants() + .map(HoodieInstant::getTimestamp).collect(Collectors.toList()).contains(instance.get()), + "Pending clustering must be completed"); + + assertTrue(HoodieCLI.getTableMetaClient().getActiveTimeline().reload() + .getCompletedReplaceTimeline().getInstants() + .map(HoodieInstant::getTimestamp).collect(Collectors.toList()).contains(instance.get()), + "Pending clustering must be completed"); + } +
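As the two tests above illustrate, clustering is a two-step flow from the CLI: the instant produced by the schedule command feeds the run command (timestamp illustrative):

    clustering schedule --hoodieConfigs hoodie.clustering.inline.max.commits=1 --sparkMaster local
    clustering run --parallelism 2 --clusteringInstant 20220101010101 --sparkMaster local

The scheduleAndExecute variant tested next collapses both steps into a single command, so no pending instant has to be looked up by hand.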
+ /** + * Test case for command 'clustering scheduleAndExecute'. + */ + @Test + public void testClusteringScheduleAndExecute() throws IOException { + // generate commits + generateCommits(); + + CommandResult cr2 = getShell().executeCommand( + String.format("clustering scheduleAndExecute --parallelism %s --sparkMaster %s", 2, "local")); + + assertAll("Command run failed", + () -> assertTrue(cr2.isSuccess()), + () -> assertTrue( + cr2.getResult().toString().startsWith("Succeeded to run clustering for scheduleAndExecute"))); + + // assert clustering complete + assertTrue(HoodieCLI.getTableMetaClient().getActiveTimeline().reload() + .getCompletedReplaceTimeline().getInstants() + .map(HoodieInstant::getTimestamp).count() > 0, + "Completed clustering count should not be 0"); + } + + private CommandResult scheduleClustering() { + // generate requested clustering + return getShell().executeCommand( + String.format("clustering schedule --hoodieConfigs hoodie.clustering.inline.max.commits=1 --sparkMaster %s", "local")); + } + + private void generateCommits() throws IOException { + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + + // Create the write client to write some records in + HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath) + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .withDeleteParallelism(2).forTable(tableName) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build(); + + SparkRDDWriteClient<HoodieAvroPayload> client = new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jsc), cfg); + + insert(jsc, client, dataGen, "001"); + insert(jsc, client, dataGen, "002"); + } + + private List<HoodieRecord> insert(JavaSparkContext jsc, SparkRDDWriteClient<HoodieAvroPayload> client, + HoodieTestDataGenerator dataGen, String newCommitTime) throws IOException { + // inserts + client.startCommitWithTime(newCommitTime); + + List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 10); + JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1); + operateFunc(SparkRDDWriteClient::insert, client, writeRecords, newCommitTime); + return records; + } + + private JavaRDD<WriteStatus> operateFunc( + HoodieClientTestBase.Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn, + SparkRDDWriteClient client, JavaRDD<HoodieRecord> writeRecords, String commitTime) + throws IOException { + return writeFn.apply(client, writeRecords, commitTime); + } +} diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCommitsCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCommitsCommand.java index b3c5c06be9a29..18f4a387d474e 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCommitsCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCommitsCommand.java @@ -100,5 +100,18 @@ public void testRollbackCommit() throws Exception { HoodieActiveTimeline timeline = metaClient.reloadActiveTimeline(); assertEquals(2, timeline.getCommitsTimeline().countInstants(), "There should have 2 instants."); + + // rollback complete commit + CommandResult cr2 = getShell().executeCommand(String.format("commit rollback --commit %s --sparkMaster %s --sparkMemory %s", + "101", "local", "4G")); + assertTrue(cr2.isSuccess()); + + metaClient = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient()); + + HoodieActiveTimeline rollbackTimeline2 = new RollbacksCommand.RollbackTimeline(metaClient); + assertEquals(2, rollbackTimeline2.getRollbackTimeline().countInstants(), "There should have 2 rollback instants."); + + HoodieActiveTimeline timeline2 = metaClient.reloadActiveTimeline(); + assertEquals(1, timeline2.getCommitsTimeline().countInstants(), "There should have 1 instant."); } }
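Across SparkMain, HoodieCompactor and HoodieClusteringJob, this change set replaces the boolean runSchedule flag with a string runningMode. A minimal sketch of how the CLI path now drives HoodieCompactor (field names as in the SparkMain hunks above; the base path is illustrative and jsc is assumed to be an existing JavaSparkContext):

    HoodieCompactor.Config cfg = new HoodieCompactor.Config();
    cfg.basePath = "/tmp/hudi_table";
    cfg.tableName = "test_table";
    cfg.parallelism = 2;
    cfg.runningMode = HoodieCompactor.SCHEDULE_AND_EXECUTE; // previously cfg.runSchedule = true/false
    int exitCode = new HoodieCompactor(jsc, cfg).compact(1); // one retry, mirroring SparkMain

The three modes (SCHEDULE, EXECUTE, SCHEDULE_AND_EXECUTE) make the intent explicit where the old boolean could only express two of them.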
diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCompactionCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCompactionCommand.java index 37a2098d0cd18..4734f45e7074b 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCompactionCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestCompactionCommand.java @@ -140,6 +140,33 @@ public void testCompact() throws IOException { "Pending compaction must be completed"); } + /** + * Test case for command 'compaction scheduleAndExecute'. + */ + @Test + public void testCompactScheduleAndExecute() throws IOException { + // generate commits + generateCommits(); + + String schemaPath = Paths.get(basePath, "compaction.schema").toString(); + writeSchemaToTmpFile(schemaPath); + + CommandResult cr2 = getShell().executeCommand( + String.format("compaction scheduleAndExecute --parallelism %s --schemaFilePath %s --sparkMaster %s", + 2, schemaPath, "local")); + + assertAll("Command run failed", + () -> assertTrue(cr2.isSuccess()), + () -> assertTrue( + cr2.getResult().toString().startsWith("Schedule and execute compaction successfully completed"))); + + // assert compaction complete + assertTrue(HoodieCLI.getTableMetaClient().getActiveTimeline().reload() + .filterCompletedInstants().getInstants() + .map(HoodieInstant::getTimestamp).count() > 0, + "Completed compaction count should not be 0"); + } + /** * Test case for command 'compaction validate'. */
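The marker removal path added earlier is covered by the integration test below; from the hudi-cli shell the command takes the instant whose marker directory should be dropped (instant time illustrative):

    marker delete --commit 101 --sparkMaster local

Under the hood this dispatches SparkMain.DELETE_MARKER, which resolves the table and calls WriteMarkersFactory.get(config.getMarkersType(), table, instantTime).quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism()).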

diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestMarkersCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestMarkersCommand.java new file mode 100644 index 0000000000000..221a29f5250d2 --- /dev/null +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestMarkersCommand.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.cli.integ; + +import org.apache.hadoop.fs.Path; +import org.apache.hudi.cli.commands.TableCommand; +import org.apache.hudi.cli.testutils.AbstractShellIntegrationTest; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.IOType; +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; +import org.apache.hudi.common.testutils.FileCreateUtils; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.springframework.shell.core.CommandResult; + +import java.io.IOException; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Integration test class for {@link org.apache.hudi.cli.commands.MarkersCommand}. + * + * A command using SparkLauncher needs to load the jars under lib, which are generated during mvn package. + * Use an integration test instead of a unit test. + */ +public class ITTestMarkersCommand extends AbstractShellIntegrationTest { + + private String tablePath; + + @BeforeEach + public void init() throws IOException { + String tableName = "test_table"; + tablePath = basePath + Path.SEPARATOR + tableName; + + // Create table and connect + new TableCommand().createTable( + tablePath, "test_table", HoodieTableType.COPY_ON_WRITE.name(), + "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload"); + } + + /** + * Test case of command 'marker delete'. + */ + @Test + public void testDeleteMarker() throws IOException { + // generate markers + String instantTime1 = "101"; + + FileCreateUtils.createMarkerFile(tablePath, "partA", instantTime1, "f0", IOType.APPEND); + FileCreateUtils.createMarkerFile(tablePath, "partA", instantTime1, "f1", IOType.APPEND); + + assertEquals(2, FileCreateUtils.getTotalMarkerFileCount(tablePath, "partA", instantTime1, IOType.APPEND)); + + CommandResult cr = getShell().executeCommand( + String.format("marker delete --commit %s --sparkMaster %s", instantTime1, "local")); + assertTrue(cr.isSuccess()); + + assertEquals(0, FileCreateUtils.getTotalMarkerFileCount(tablePath, "partA", instantTime1, IOType.APPEND)); + } +} diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/HoodieTestCommitMetadataGenerator.java b/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/HoodieTestCommitMetadataGenerator.java index 105a9f639c792..f59dca4e1ea9f 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/HoodieTestCommitMetadataGenerator.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/HoodieTestCommitMetadataGenerator.java @@ -73,9 +73,9 @@ public static void createCommitFileWithMetadata(String basePath, String commitTi } public static void createCommitFileWithMetadata(String basePath, String commitTime, Configuration configuration, - Option writes, Option updates, Map extraMetdata) throws Exception { + Option writes, Option updates, Map extraMetadata) throws Exception { createCommitFileWithMetadata(basePath, commitTime, configuration, UUID.randomUUID().toString(), - UUID.randomUUID().toString(), writes, updates, extraMetdata); + UUID.randomUUID().toString(), writes, updates, extraMetadata); } public static void createCommitFileWithMetadata(String basePath, String commitTime, Configuration configuration, diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/SparkUtilTest.java b/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/SparkUtilTest.java new file mode 100644 index 0000000000000..4966438292949 --- /dev/null +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/testutils/SparkUtilTest.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.cli.testutils; + +import org.apache.hudi.common.util.Option; +import org.apache.hudi.cli.utils.SparkUtil; +import org.apache.spark.SparkConf; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class SparkUtilTest { + @Test + public void testGetDefaultSparkConf() { + SparkConf sparkConf = SparkUtil.getDefaultConf("test-spark-app", Option.of("")); + assertEquals(SparkUtil.DEFAULT_SPARK_MASTER, sparkConf.get("spark.master")); + } +} diff --git a/hudi-client/hudi-client-common/pom.xml b/hudi-client/hudi-client-common/pom.xml index a9209f5534df8..a55a136652728 100644 --- a/hudi-client/hudi-client-common/pom.xml +++ b/hudi-client/hudi-client-common/pom.xml @@ -163,7 +163,6 @@ org.awaitility awaitility - 3.1.2 test diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncArchiveService.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncArchiveService.java new file mode 100644 index 0000000000000..3fdc21dd21683 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncArchiveService.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.async; + +import org.apache.hudi.client.BaseHoodieWriteClient; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * Async archive service to run concurrently with write operation. 
+ */ +public class AsyncArchiveService extends HoodieAsyncTableService { + + private static final Logger LOG = LogManager.getLogger(AsyncArchiveService.class); + + private final BaseHoodieWriteClient writeClient; + private final transient ExecutorService executor = Executors.newSingleThreadExecutor(); + + protected AsyncArchiveService(BaseHoodieWriteClient writeClient) { + super(writeClient.getConfig()); + this.writeClient = writeClient; + } + + @Override + protected Pair startService() { + LOG.info("Starting async archive service..."); + return Pair.of(CompletableFuture.supplyAsync(() -> { + writeClient.archive(); + return true; + }, executor), executor); + } + + public static AsyncArchiveService startAsyncArchiveIfEnabled(BaseHoodieWriteClient writeClient) { + HoodieWriteConfig config = writeClient.getConfig(); + if (!config.isAutoArchive() || !config.isAsyncArchive()) { + LOG.info("The HoodieWriteClient is not configured to auto & async archive. Async archive service will not start."); + return null; + } + AsyncArchiveService asyncArchiveService = new AsyncArchiveService(writeClient); + asyncArchiveService.start(null); + return asyncArchiveService; + } + + public static void waitForCompletion(AsyncArchiveService asyncArchiveService) { + if (asyncArchiveService != null) { + LOG.info("Waiting for async archive service to finish"); + try { + asyncArchiveService.waitForShutdown(); + } catch (Exception e) { + throw new HoodieException("Error waiting for async archive service to finish", e); + } + } + } + + public static void forceShutdown(AsyncArchiveService asyncArchiveService) { + if (asyncArchiveService != null) { + LOG.info("Shutting down async archive service..."); + asyncArchiveService.shutdown(true); + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AsyncCleanerService.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncCleanerService.java similarity index 56% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AsyncCleanerService.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncCleanerService.java index a5a38f2cc5949..72907e6d3fbcd 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AsyncCleanerService.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncCleanerService.java @@ -7,21 +7,24 @@ * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
*/ -package org.apache.hudi.client; +package org.apache.hudi.async; -import org.apache.hudi.async.HoodieAsyncService; +import org.apache.hudi.client.BaseHoodieWriteClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; + import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -30,54 +33,55 @@ import java.util.concurrent.Executors; /** - * Clean service running concurrently with write operation. + * Async clean service to run concurrently with write operation. */ -class AsyncCleanerService extends HoodieAsyncService { +public class AsyncCleanerService extends HoodieAsyncTableService { private static final Logger LOG = LogManager.getLogger(AsyncCleanerService.class); - private final AbstractHoodieWriteClient writeClient; + private final BaseHoodieWriteClient writeClient; private final transient ExecutorService executor = Executors.newSingleThreadExecutor(); - protected AsyncCleanerService(AbstractHoodieWriteClient writeClient) { + protected AsyncCleanerService(BaseHoodieWriteClient writeClient) { + super(writeClient.getConfig()); this.writeClient = writeClient; } @Override protected Pair startService() { String instantTime = HoodieActiveTimeline.createNewInstantTime(); - LOG.info("Auto cleaning is enabled. Running cleaner async to write operation at instant time " + instantTime); + LOG.info(String.format("Starting async clean service with instant time %s...", instantTime)); return Pair.of(CompletableFuture.supplyAsync(() -> { writeClient.clean(instantTime); return true; }, executor), executor); } - public static AsyncCleanerService startAsyncCleaningIfEnabled(AbstractHoodieWriteClient writeClient) { - AsyncCleanerService asyncCleanerService = null; - if (writeClient.getConfig().isAutoClean() && writeClient.getConfig().isAsyncClean()) { - asyncCleanerService = new AsyncCleanerService(writeClient); - asyncCleanerService.start(null); - } else { - LOG.info("Async auto cleaning is not enabled. Not running cleaner now"); + public static AsyncCleanerService startAsyncCleaningIfEnabled(BaseHoodieWriteClient writeClient) { + HoodieWriteConfig config = writeClient.getConfig(); + if (!config.isAutoClean() || !config.isAsyncClean()) { + LOG.info("The HoodieWriteClient is not configured to auto & async clean. 
Async clean service will not start."); + return null; } + AsyncCleanerService asyncCleanerService = new AsyncCleanerService(writeClient); + asyncCleanerService.start(null); return asyncCleanerService; } public static void waitForCompletion(AsyncCleanerService asyncCleanerService) { if (asyncCleanerService != null) { - LOG.info("Waiting for async cleaner to finish"); + LOG.info("Waiting for async clean service to finish"); try { asyncCleanerService.waitForShutdown(); } catch (Exception e) { - throw new HoodieException("Error waiting for async cleaning to finish", e); + throw new HoodieException("Error waiting for async clean service to finish", e); } } } public static void forceShutdown(AsyncCleanerService asyncCleanerService) { if (asyncCleanerService != null) { - LOG.info("Shutting down async cleaner"); + LOG.info("Shutting down async clean service..."); asyncCleanerService.shutdown(true); } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncClusteringService.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncClusteringService.java index b9707bb6d82a7..1c1cf2bb9f74b 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncClusteringService.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncClusteringService.java @@ -19,8 +19,8 @@ package org.apache.hudi.async; -import org.apache.hudi.client.AbstractClusteringClient; -import org.apache.hudi.client.AbstractHoodieWriteClient; +import org.apache.hudi.client.BaseClusterer; +import org.apache.hudi.client.BaseHoodieWriteClient; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieIOException; @@ -38,25 +38,25 @@ * Async clustering service that runs in a separate thread. * Currently, only one clustering thread is allowed to run at any time. */ -public abstract class AsyncClusteringService extends HoodieAsyncService { +public abstract class AsyncClusteringService extends HoodieAsyncTableService { private static final long serialVersionUID = 1L; private static final Logger LOG = LogManager.getLogger(AsyncClusteringService.class); private final int maxConcurrentClustering; - private transient AbstractClusteringClient clusteringClient; + private transient BaseClusterer clusteringClient; - public AsyncClusteringService(AbstractHoodieWriteClient writeClient) { + public AsyncClusteringService(BaseHoodieWriteClient writeClient) { this(writeClient, false); } - public AsyncClusteringService(AbstractHoodieWriteClient writeClient, boolean runInDaemonMode) { - super(runInDaemonMode); + public AsyncClusteringService(BaseHoodieWriteClient writeClient, boolean runInDaemonMode) { + super(writeClient.getConfig(), runInDaemonMode); this.clusteringClient = createClusteringClient(writeClient); this.maxConcurrentClustering = 1; } - protected abstract AbstractClusteringClient createClusteringClient(AbstractHoodieWriteClient client); + protected abstract BaseClusterer createClusteringClient(BaseHoodieWriteClient client); /** * Start clustering service. @@ -82,10 +82,16 @@ protected Pair startService() { } LOG.info("Clustering executor shutting down properly"); } catch (InterruptedException ie) { + hasError = true; LOG.warn("Clustering executor got interrupted exception! 
Stopping", ie); } catch (IOException e) { - LOG.error("Clustering executor failed", e); + hasError = true; + LOG.error("Clustering executor failed due to IOException", e); throw new HoodieIOException(e.getMessage(), e); + } catch (Exception e) { + hasError = true; + LOG.error("Clustering executor failed", e); + throw e; } return true; }, executor)).toArray(CompletableFuture[]::new)), executor); @@ -94,7 +100,7 @@ protected Pair startService() { /** * Update the write client to be used for clustering. */ - public synchronized void updateWriteClient(AbstractHoodieWriteClient writeClient) { + public synchronized void updateWriteClient(BaseHoodieWriteClient writeClient) { this.clusteringClient.updateWriteClient(writeClient); } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncCompactService.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncCompactService.java index 2f63297210e14..f1f7f416e466c 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncCompactService.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/AsyncCompactService.java @@ -17,8 +17,8 @@ package org.apache.hudi.async; -import org.apache.hudi.client.AbstractCompactor; -import org.apache.hudi.client.AbstractHoodieWriteClient; +import org.apache.hudi.client.BaseCompactor; +import org.apache.hudi.client.BaseHoodieWriteClient; import org.apache.hudi.common.engine.EngineProperty; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.table.timeline.HoodieInstant; @@ -37,7 +37,7 @@ /** * Async Compactor Service that runs in separate thread. Currently, only one compactor is allowed to run at any time. */ -public abstract class AsyncCompactService extends HoodieAsyncService { +public abstract class AsyncCompactService extends HoodieAsyncTableService { private static final long serialVersionUID = 1L; private static final Logger LOG = LogManager.getLogger(AsyncCompactService.class); @@ -48,21 +48,21 @@ public abstract class AsyncCompactService extends HoodieAsyncService { public static final String COMPACT_POOL_NAME = "hoodiecompact"; private final int maxConcurrentCompaction; - private transient AbstractCompactor compactor; + private transient BaseCompactor compactor; protected transient HoodieEngineContext context; - public AsyncCompactService(HoodieEngineContext context, AbstractHoodieWriteClient client) { + public AsyncCompactService(HoodieEngineContext context, BaseHoodieWriteClient client) { this(context, client, false); } - public AsyncCompactService(HoodieEngineContext context, AbstractHoodieWriteClient client, boolean runInDaemonMode) { - super(runInDaemonMode); + public AsyncCompactService(HoodieEngineContext context, BaseHoodieWriteClient client, boolean runInDaemonMode) { + super(client.getConfig(), runInDaemonMode); this.context = context; this.compactor = createCompactor(client); this.maxConcurrentCompaction = 1; } - protected abstract AbstractCompactor createCompactor(AbstractHoodieWriteClient client); + protected abstract BaseCompactor createCompactor(BaseHoodieWriteClient client); /** * Start Compaction Service. @@ -92,10 +92,16 @@ protected Pair startService() { } LOG.info("Compactor shutting down properly!!"); } catch (InterruptedException ie) { + hasError = true; LOG.warn("Compactor executor thread got interrupted exception. 
Stopping", ie); } catch (IOException e) { - LOG.error("Compactor executor failed", e); + hasError = true; + LOG.error("Compactor executor failed due to IOException", e); throw new HoodieIOException(e.getMessage(), e); + } catch (Exception e) { + hasError = true; + LOG.error("Compactor executor failed", e); + throw e; } return true; }, executor)).toArray(CompletableFuture[]::new)), executor); @@ -110,7 +116,7 @@ protected boolean shouldStopCompactor() { return false; } - public synchronized void updateWriteClient(AbstractHoodieWriteClient writeClient) { + public synchronized void updateWriteClient(BaseHoodieWriteClient writeClient) { this.compactor.updateWriteClient(writeClient); } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/HoodieAsyncService.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/HoodieAsyncService.java index f57484d886c9b..1ce6dfb288d62 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/HoodieAsyncService.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/HoodieAsyncService.java @@ -36,12 +36,15 @@ import java.util.function.Function; /** - * Base Class for running clean/delta-sync/compaction/clustering in separate thread and controlling their life-cycle. + * Base Class for running archive/clean/delta-sync/compaction/clustering in separate thread and controlling their life-cycles. */ public abstract class HoodieAsyncService implements Serializable { private static final Logger LOG = LogManager.getLogger(HoodieAsyncService.class); + private static final long POLLING_SECONDS = 10; + // Flag indicating whether an error is incurred in the service + protected boolean hasError; // Flag to track if the service is started. private boolean started; // Flag indicating shutdown is externally requested @@ -70,21 +73,32 @@ protected HoodieAsyncService(boolean runInDaemonMode) { this.runInDaemonMode = runInDaemonMode; } - protected boolean isShutdownRequested() { + public boolean isStarted() { + return started; + } + + public boolean isShutdownRequested() { return shutdownRequested; } - protected boolean isShutdown() { + public boolean isShutdown() { return shutdown; } + public boolean hasError() { + return hasError; + } + /** * Wait till the service shutdown. If the service shutdown with exception, it will be thrown - * + * * @throws ExecutionException * @throws InterruptedException */ public void waitForShutdown() throws ExecutionException, InterruptedException { + if (future == null) { + return; + } try { future.get(); } catch (ExecutionException ex) { @@ -102,6 +116,7 @@ public void waitForShutdown() throws ExecutionException, InterruptedException { public void shutdown(boolean force) { if (!shutdownRequested || force) { shutdownRequested = true; + shutdown = true; if (executor != null) { if (force) { executor.shutdownNow(); @@ -125,6 +140,10 @@ public void shutdown(boolean force) { * @param onShutdownCallback */ public void start(Function onShutdownCallback) { + if (started) { + LOG.warn("The async service already started."); + return; + } Pair res = startService(); future = res.getKey(); executor = res.getValue(); @@ -134,8 +153,6 @@ public void start(Function onShutdownCallback) { /** * Service implementation. 
- * - * @return */ protected abstract Pair startService(); @@ -146,6 +163,9 @@ public void start(Function onShutdownCallback) { */ @SuppressWarnings("unchecked") private void shutdownCallback(Function callback) { + if (future == null) { + return; + } future.whenComplete((resp, error) -> { if (null != callback) { callback.apply(null != error); @@ -166,8 +186,8 @@ public boolean isRunInDaemonMode() { public void waitTillPendingAsyncServiceInstantsReducesTo(int numPending) throws InterruptedException { try { queueLock.lock(); - while (!isShutdown() && (pendingInstants.size() > numPending)) { - consumed.await(); + while (!isShutdown() && !hasError() && (pendingInstants.size() > numPending)) { + consumed.await(POLLING_SECONDS, TimeUnit.SECONDS); } } finally { queueLock.unlock(); @@ -190,8 +210,8 @@ public void enqueuePendingAsyncServiceInstant(HoodieInstant instant) { * @throws InterruptedException */ HoodieInstant fetchNextAsyncServiceInstant() throws InterruptedException { - LOG.info("Waiting for next instant upto 10 seconds"); - HoodieInstant instant = pendingInstants.poll(10, TimeUnit.SECONDS); + LOG.info(String.format("Waiting for next instant up to %d seconds", POLLING_SECONDS)); + HoodieInstant instant = pendingInstants.poll(POLLING_SECONDS, TimeUnit.SECONDS); if (instant != null) { try { queueLock.lock(); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/HoodieAsyncTableService.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/HoodieAsyncTableService.java new file mode 100644 index 0000000000000..6a53d30063c1d --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/async/HoodieAsyncTableService.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
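One detail of the hunk above deserves emphasis: `waitTillPendingAsyncServiceInstantsReducesTo` now re-checks shutdown and error state every `POLLING_SECONDS` instead of blocking indefinitely on `consumed.await()`, so a worker that fails after setting `hasError` can no longer hang its waiters. A self-contained sketch of the pattern (field names mirror `HoodieAsyncService`, but the class itself is illustrative and simplifies the pending-instants queue to a counter):

```java
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

// Bounded-wait sketch: the loop wakes up periodically even when no producer
// signals the condition, so shutdown/error transitions are always observed.
class BoundedWaitSketch {
  private static final long POLLING_SECONDS = 10;
  private final ReentrantLock queueLock = new ReentrantLock();
  private final Condition consumed = queueLock.newCondition();
  private volatile boolean shutdown;
  private volatile boolean hasError;
  private volatile int pendingInstants; // stands in for the real pending-instants queue

  void waitTillPendingReducesTo(int numPending) throws InterruptedException {
    queueLock.lock();
    try {
      while (!shutdown && !hasError && pendingInstants > numPending) {
        consumed.await(POLLING_SECONDS, TimeUnit.SECONDS); // timed wait, not await()
      }
    } finally {
      queueLock.unlock();
    }
  }
}
```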
+ */ + +package org.apache.hudi.async; + +import org.apache.hudi.client.RunsTableService; +import org.apache.hudi.config.HoodieWriteConfig; + +import java.util.function.Function; + +public abstract class HoodieAsyncTableService extends HoodieAsyncService implements RunsTableService { + + protected HoodieWriteConfig writeConfig; + + protected HoodieAsyncTableService() { + } + + protected HoodieAsyncTableService(HoodieWriteConfig writeConfig) { + this.writeConfig = writeConfig; + } + + protected HoodieAsyncTableService(HoodieWriteConfig writeConfig, boolean runInDaemonMode) { + super(runInDaemonMode); + this.writeConfig = writeConfig; + } + + @Override + public void start(Function onShutdownCallback) { + if (!tableServicesEnabled(writeConfig)) { + return; + } + super.start(onShutdownCallback); + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractClusteringClient.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseClusterer.java similarity index 80% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractClusteringClient.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseClusterer.java index 34234f546ed19..648ce805b0825 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractClusteringClient.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseClusterer.java @@ -28,13 +28,13 @@ /** * Client will run one round of clustering. */ -public abstract class AbstractClusteringClient implements Serializable { +public abstract class BaseClusterer implements Serializable { private static final long serialVersionUID = 1L; - protected transient AbstractHoodieWriteClient clusteringClient; + protected transient BaseHoodieWriteClient clusteringClient; - public AbstractClusteringClient(AbstractHoodieWriteClient clusteringClient) { + public BaseClusterer(BaseHoodieWriteClient clusteringClient) { this.clusteringClient = clusteringClient; } @@ -49,7 +49,7 @@ public AbstractClusteringClient(AbstractHoodieWriteClient clustering * Update the write client used by async clustering. * @param writeClient */ - public void updateWriteClient(AbstractHoodieWriteClient writeClient) { + public void updateWriteClient(BaseHoodieWriteClient writeClient) { this.clusteringClient = writeClient; } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractCompactor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseCompactor.java similarity index 78% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractCompactor.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseCompactor.java index c80b34a3ef656..88737dbcf1d7e 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractCompactor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseCompactor.java @@ -27,19 +27,19 @@ /** * Run one round of compaction. 
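Because the new base class overrides `start()` to consult `tableServicesEnabled(writeConfig)` before delegating to `HoodieAsyncService`, every async table service inherits the global kill switch without extra code. A minimal, hypothetical subclass, assuming `startService()` keeps the `Pair<CompletableFuture, ExecutorService>` contract used by the services above:

```java
import org.apache.hudi.async.HoodieAsyncTableService;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;

import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

// Hypothetical no-op service, not part of the patch: when table services are
// globally disabled, start() returns early and this executor is never used.
class NoOpTableService extends HoodieAsyncTableService {
  private final transient ExecutorService executor = Executors.newSingleThreadExecutor();

  NoOpTableService(HoodieWriteConfig config) {
    super(config);
  }

  @Override
  protected Pair<CompletableFuture, ExecutorService> startService() {
    return Pair.of(CompletableFuture.supplyAsync(() -> true, executor), executor);
  }
}
```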
*/ -public abstract class AbstractCompactor implements Serializable { +public abstract class BaseCompactor implements Serializable { private static final long serialVersionUID = 1L; - protected transient AbstractHoodieWriteClient compactionClient; + protected transient BaseHoodieWriteClient compactionClient; - public AbstractCompactor(AbstractHoodieWriteClient compactionClient) { + public BaseCompactor(BaseHoodieWriteClient compactionClient) { this.compactionClient = compactionClient; } public abstract void compact(HoodieInstant instant) throws IOException; - public void updateWriteClient(AbstractHoodieWriteClient writeClient) { + public void updateWriteClient(BaseHoodieWriteClient writeClient) { this.compactionClient = writeClient; } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractHoodieClient.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieClient.java similarity index 91% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractHoodieClient.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieClient.java index 350fe0c9bf7e0..3f208a0f86a09 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractHoodieClient.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieClient.java @@ -40,9 +40,9 @@ * Abstract class taking care of holding common member variables (FileSystem, SparkContext, HoodieConfigs) Also, manages * embedded timeline-server if enabled. */ -public abstract class AbstractHoodieClient implements Serializable, AutoCloseable { +public abstract class BaseHoodieClient implements Serializable, AutoCloseable { - private static final Logger LOG = LogManager.getLogger(AbstractHoodieClient.class); + private static final Logger LOG = LogManager.getLogger(BaseHoodieClient.class); protected final transient FileSystem fs; protected final transient HoodieEngineContext context; @@ -59,11 +59,11 @@ public abstract class AbstractHoodieClient implements Serializable, AutoCloseabl private transient Option timelineServer; private final boolean shouldStopTimelineServer; - protected AbstractHoodieClient(HoodieEngineContext context, HoodieWriteConfig clientConfig) { + protected BaseHoodieClient(HoodieEngineContext context, HoodieWriteConfig clientConfig) { this(context, clientConfig, Option.empty()); } - protected AbstractHoodieClient(HoodieEngineContext context, HoodieWriteConfig clientConfig, + protected BaseHoodieClient(HoodieEngineContext context, HoodieWriteConfig clientConfig, Option timelineServer) { this.hadoopConf = context.getHadoopConf().get(); this.fs = FSUtils.getFs(clientConfig.getBasePath(), hadoopConf); @@ -134,7 +134,8 @@ protected void initWrapperFSMetrics() { protected HoodieTableMetaClient createMetaClient(boolean loadActiveTimelineOnLoad) { return HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(config.getBasePath()) .setLoadActiveTimelineOnLoad(loadActiveTimelineOnLoad).setConsistencyGuardConfig(config.getConsistencyGuardConfig()) - .setLayoutVersion(Option.of(new TimelineLayoutVersion(config.getTimelineLayoutVersion()))).build(); + .setLayoutVersion(Option.of(new TimelineLayoutVersion(config.getTimelineLayoutVersion()))) + .setFileSystemRetryConfig(config.getFileSystemRetryConfig()).build(); } public Option getTimelineServer() { diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractHoodieWriteClient.java 
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java similarity index 87% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractHoodieWriteClient.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java index c9162de9e9ca1..7b67ff54a2aa5 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractHoodieWriteClient.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieWriteClient.java @@ -18,11 +18,14 @@ package org.apache.hudi.client; +import org.apache.hudi.async.AsyncArchiveService; +import org.apache.hudi.async.AsyncCleanerService; import org.apache.hudi.avro.model.HoodieCleanMetadata; import org.apache.hudi.avro.model.HoodieCleanerPlan; import org.apache.hudi.avro.model.HoodieClusteringPlan; import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.avro.model.HoodieRestoreMetadata; +import org.apache.hudi.avro.model.HoodieRestorePlan; import org.apache.hudi.avro.model.HoodieRollbackMetadata; import org.apache.hudi.avro.model.HoodieRollbackPlan; import org.apache.hudi.callback.HoodieWriteCommitCallback; @@ -66,7 +69,6 @@ import org.apache.hudi.metrics.HoodieMetrics; import org.apache.hudi.table.BulkInsertPartitioner; import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.HoodieTimelineArchiveLog; import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.hudi.table.action.rollback.RollbackUtils; import org.apache.hudi.table.action.savepoint.SavepointHelpers; @@ -98,14 +100,15 @@ * @param Type of keys * @param Type of outputs */ -public abstract class AbstractHoodieWriteClient extends AbstractHoodieClient { +public abstract class BaseHoodieWriteClient extends BaseHoodieClient + implements RunsTableService { protected static final String LOOKUP_STR = "lookup"; private static final long serialVersionUID = 1L; - private static final Logger LOG = LogManager.getLogger(AbstractHoodieWriteClient.class); + private static final Logger LOG = LogManager.getLogger(BaseHoodieWriteClient.class); protected final transient HoodieMetrics metrics; - private final transient HoodieIndex index; + private final transient HoodieIndex index; protected transient Timer.Context writeTimer = null; protected transient Timer.Context compactionTimer; @@ -114,6 +117,7 @@ public abstract class AbstractHoodieWriteClient>> lastCompletedTxnAndMetadata = Option.empty(); @@ -123,7 +127,7 @@ public abstract class AbstractHoodieWriteClient timelineService) { super(context, writeConfig, timelineService); this.metrics = new HoodieMetrics(config); @@ -142,7 +146,7 @@ public AbstractHoodieWriteClient(HoodieEngineContext context, HoodieWriteConfig this.txnManager = new TransactionManager(config, fs); } - protected abstract HoodieIndex createIndex(HoodieWriteConfig writeConfig); + protected abstract HoodieIndex createIndex(HoodieWriteConfig writeConfig); public void setOperationType(WriteOperationType operationType) { this.operationType = operationType; @@ -359,7 +363,7 @@ public void rollbackFailedBootstrap() { * table for the very first time (e.g: converting an existing table to Hoodie). *

* This implementation uses sortBy (which does range partitioning based on reservoir sampling) and attempts to control - * the numbers of files with less memory compared to the {@link AbstractHoodieWriteClient#insert(I, String)} + * the numbers of files with less memory compared to the {@link BaseHoodieWriteClient#insert(I, String)} * * @param records HoodieRecords to insert * @param instantTime Instant time of the commit @@ -372,7 +376,7 @@ public void rollbackFailedBootstrap() { * table for the very first time (e.g: converting an existing table to Hoodie). *

* This implementation uses sortBy (which does range partitioning based on reservoir sampling) and attempts to control - * the numbers of files with less memory compared to the {@link AbstractHoodieWriteClient#insert(I, String)}. Optionally + * the numbers of files with less memory compared to the {@link BaseHoodieWriteClient#insert(I, String)}. Optionally * it allows users to specify their own partitioner. If specified then it will be used for repartitioning records. See * {@link BulkInsertPartitioner}. * @@ -392,7 +396,7 @@ public abstract O bulkInsert(I records, final String instantTime, * duplicates if needed. *

* This implementation uses sortBy (which does range partitioning based on reservoir sampling) and attempts to control - * the numbers of files with less memory compared to the {@link AbstractHoodieWriteClient#insert(I, String)}. Optionally + * the numbers of files with less memory compared to the {@link BaseHoodieWriteClient#insert(I, String)}. Optionally * it allows users to specify their own partitioner. If specified then it will be used for repartitioning records. See * {@link BulkInsertPartitioner}. * @@ -430,6 +434,11 @@ protected void preWrite(String instantTime, WriteOperationType writeOperationTyp } else { this.asyncCleanerService.start(null); } + if (null == this.asyncArchiveService) { + this.asyncArchiveService = AsyncArchiveService.startAsyncArchiveIfEnabled(this); + } else { + this.asyncArchiveService.start(null); + } } /** @@ -455,16 +464,17 @@ protected void postCommit(HoodieTable table, HoodieCommitMetadata me WriteMarkersFactory.get(config.getMarkersType(), table, instantTime) .quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism()); autoCleanOnCommit(); - if (config.isAutoArchive()) { - archive(table); - } + autoArchiveOnCommit(table); } finally { this.heartbeatClient.stop(instantTime); } } protected void runTableServicesInline(HoodieTable table, HoodieCommitMetadata metadata, Option> extraMetadata) { - if (config.areAnyTableServicesInline()) { + if (!tableServicesEnabled(config)) { + return; + } + if (config.areAnyTableServicesExecutedInline() || config.areAnyTableServicesScheduledInline()) { if (config.isMetadataTableEnabled()) { table.getHoodieView().sync(); } @@ -472,19 +482,35 @@ protected void runTableServicesInline(HoodieTable table, HoodieCommi if (config.inlineCompactionEnabled()) { runAnyPendingCompactions(table); metadata.addMetadata(HoodieCompactionConfig.INLINE_COMPACT.key(), "true"); - inlineCompact(extraMetadata); + inlineCompaction(extraMetadata); } else { metadata.addMetadata(HoodieCompactionConfig.INLINE_COMPACT.key(), "false"); } + // if just inline schedule is enabled + if (!config.inlineCompactionEnabled() && config.scheduleInlineCompaction() + && !table.getActiveTimeline().getWriteTimeline().filterPendingCompactionTimeline().getInstants().findAny().isPresent()) { + // proceed only if there are no pending compactions + metadata.addMetadata(HoodieCompactionConfig.SCHEDULE_INLINE_COMPACT.key(), "true"); + inlineScheduleCompaction(extraMetadata); + } + // Do an inline clustering if enabled if (config.inlineClusteringEnabled()) { runAnyPendingClustering(table); metadata.addMetadata(HoodieClusteringConfig.INLINE_CLUSTERING.key(), "true"); - inlineCluster(extraMetadata); + inlineClustering(extraMetadata); } else { metadata.addMetadata(HoodieClusteringConfig.INLINE_CLUSTERING.key(), "false"); } + + // if just inline schedule is enabled + if (!config.inlineClusteringEnabled() && config.scheduleInlineClustering() + && !table.getActiveTimeline().filterPendingReplaceTimeline().getInstants().findAny().isPresent()) { + // proceed only if there are no pending clustering + metadata.addMetadata(HoodieClusteringConfig.SCHEDULE_INLINE_CLUSTERING.key(), "true"); + inlineScheduleClustering(extraMetadata); + } } } @@ -506,22 +532,34 @@ protected void runAnyPendingClustering(HoodieTable table) { }); } - /** - * Handle auto clean during commit. 
- * - */ protected void autoCleanOnCommit() { - if (config.isAutoClean()) { - // Call clean to cleanup if there is anything to cleanup after the commit, - if (config.isAsyncClean()) { - LOG.info("Cleaner has been spawned already. Waiting for it to finish"); - AsyncCleanerService.waitForCompletion(asyncCleanerService); - LOG.info("Cleaner has finished"); - } else { - // Do not reuse instantTime for clean as metadata table requires all changes to have unique instant timestamps. - LOG.info("Auto cleaning is enabled. Running cleaner now"); - clean(true); - } + if (!config.isAutoClean()) { + return; + } + + if (config.isAsyncClean()) { + LOG.info("Async cleaner has been spawned. Waiting for it to finish"); + AsyncCleanerService.waitForCompletion(asyncCleanerService); + LOG.info("Async cleaner has finished"); + } else { + LOG.info("Starting to clean synchronously."); + // Do not reuse instantTime for clean as metadata table requires all changes to have unique instant timestamps. + clean(true); + } + } + + protected void autoArchiveOnCommit(HoodieTable table) { + if (!config.isAutoArchive()) { + return; + } + + if (config.isAsyncArchive()) { + LOG.info("Async archiver has been spawned. Waiting for it to finish"); + AsyncArchiveService.waitForCompletion(asyncArchiveService); + LOG.info("Async archiver has finished"); + } else { + LOG.info("Starting to archive synchronously."); + archive(table); } } @@ -606,7 +644,7 @@ public boolean rollback(final String commitInstantTime) throws HoodieRollbackExc /** * @Deprecated * Rollback the inflight record changes with the given commit time. This - * will be removed in future in favor of {@link AbstractHoodieWriteClient#restoreToInstant(String)} + * will be removed in the future in favor of {@link BaseHoodieWriteClient#restoreToInstant(String)} * Adding this API for backwards compatibility. * @param commitInstantTime Instant time of the commit * @param skipLocking if this is triggered by another parent transaction, locking can be skipped. @@ -620,7 +658,7 @@ public boolean rollback(final String commitInstantTime, boolean skipLocking) thr /** * @Deprecated * Rollback the inflight record changes with the given commit time. This - * will be removed in future in favor of {@link AbstractHoodieWriteClient#restoreToInstant(String)} + * will be removed in the future in favor of {@link BaseHoodieWriteClient#restoreToInstant(String)} * * @param commitInstantTime Instant time of the commit * @param pendingRollbackInfo pending rollback instant and plan if rollback failed from previous attempt.
@@ -639,8 +677,8 @@ public boolean rollback(final String commitInstantTime, Option rollbackPlanOption = pendingRollbackInfo.map(entry -> Option.of(entry.getRollbackPlan())).orElse(table.scheduleRollback(context, rollbackInstantTime, - commitInstantOpt.get(), false, config.shouldRollbackUsingMarkers())); + Option rollbackPlanOption = pendingRollbackInfo.map(entry -> Option.of(entry.getRollbackPlan())) + .orElseGet(() -> table.scheduleRollback(context, rollbackInstantTime, commitInstantOpt.get(), false, config.shouldRollbackUsingMarkers())); if (rollbackPlanOption.isPresent()) { // execute rollback HoodieRollbackMetadata rollbackMetadata = table.rollback(context, rollbackInstantTime, commitInstantOpt.get(), true, @@ -674,16 +712,21 @@ public HoodieRestoreMetadata restoreToInstant(final String instantTime) throws H Timer.Context timerContext = metrics.getRollbackCtx(); try { HoodieTable table = createTable(config, hadoopConf, config.isMetadataTableEnabled()); - HoodieRestoreMetadata restoreMetadata = table.restore(context, restoreInstantTime, instantTime); - if (timerContext != null) { - final long durationInMs = metrics.getDurationInMs(timerContext.stop()); - final long totalFilesDeleted = restoreMetadata.getHoodieRestoreMetadata().values().stream() - .flatMap(Collection::stream) - .mapToLong(HoodieRollbackMetadata::getTotalFilesDeleted) - .sum(); - metrics.updateRollbackMetrics(durationInMs, totalFilesDeleted); + Option restorePlanOption = table.scheduleRestore(context, restoreInstantTime, instantTime); + if (restorePlanOption.isPresent()) { + HoodieRestoreMetadata restoreMetadata = table.restore(context, restoreInstantTime, instantTime); + if (timerContext != null) { + final long durationInMs = metrics.getDurationInMs(timerContext.stop()); + final long totalFilesDeleted = restoreMetadata.getHoodieRestoreMetadata().values().stream() + .flatMap(Collection::stream) + .mapToLong(HoodieRollbackMetadata::getTotalFilesDeleted) + .sum(); + metrics.updateRollbackMetrics(durationInMs, totalFilesDeleted); + } + return restoreMetadata; + } else { + throw new HoodieRestoreException("Failed to restore " + config.getBasePath() + " to commit " + instantTime); } - return restoreMetadata; } catch (Exception e) { throw new HoodieRestoreException("Failed to restore to " + instantTime, e); } @@ -714,28 +757,38 @@ public HoodieCleanMetadata clean(String cleanInstantTime, boolean skipLocking) t * Clean up any stale/old files/data lying around (either on file storage or index storage) based on the * configurations and CleaningPolicy used. (typically files that no longer can be used by a running query can be * cleaned). This API provides the flexibility to schedule clean instant asynchronously via - * {@link AbstractHoodieWriteClient#scheduleTableService(String, Option, TableServiceType)} and disable inline scheduling + * {@link BaseHoodieWriteClient#scheduleTableService(String, Option, TableServiceType)} and disable inline scheduling * of clean. * @param cleanInstantTime instant time for clean. * @param scheduleInline true if needs to be scheduled inline. false otherwise. * @param skipLocking if this is triggered by another parent transaction, locking can be skipped. 
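Restore now follows the same schedule-then-execute shape as the other table services: a `HoodieRestorePlan` is scheduled first, and execution proceeds only when a plan was actually produced. A condensed, illustrative rendering of that flow (metrics omitted; a raw `HoodieTable` is used for brevity):

```java
import org.apache.hudi.avro.model.HoodieRestoreMetadata;
import org.apache.hudi.avro.model.HoodieRestorePlan;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.exception.HoodieRestoreException;
import org.apache.hudi.table.HoodieTable;

// Two-phase restore sketch: schedule a plan, then execute it, surfacing the
// absence of a plan as an explicit failure instead of restoring blindly.
class TwoPhaseRestoreSketch {
  static HoodieRestoreMetadata restore(HoodieTable table, HoodieEngineContext context,
                                       String restoreInstantTime, String instantTime) {
    Option<HoodieRestorePlan> plan = table.scheduleRestore(context, restoreInstantTime, instantTime);
    if (!plan.isPresent()) {
      throw new HoodieRestoreException("Failed to restore to commit " + instantTime);
    }
    return table.restore(context, restoreInstantTime, instantTime);
  }
}
```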
*/ public HoodieCleanMetadata clean(String cleanInstantTime, boolean scheduleInline, boolean skipLocking) throws HoodieIOException { - if (scheduleInline) { - scheduleTableServiceInternal(cleanInstantTime, Option.empty(), TableServiceType.CLEAN); + if (!tableServicesEnabled(config)) { + return null; } - LOG.info("Cleaner started"); final Timer.Context timerContext = metrics.getCleanCtx(); - LOG.info("Cleaned failed attempts if any"); CleanerUtils.rollbackFailedWrites(config.getFailedWritesCleanPolicy(), HoodieTimeline.CLEAN_ACTION, () -> rollbackFailedWrites(skipLocking)); - HoodieCleanMetadata metadata = createTable(config, hadoopConf).clean(context, cleanInstantTime, skipLocking); - if (timerContext != null && metadata != null) { - long durationMs = metrics.getDurationInMs(timerContext.stop()); - metrics.updateCleanMetrics(durationMs, metadata.getTotalFilesDeleted()); - LOG.info("Cleaned " + metadata.getTotalFilesDeleted() + " files" - + " Earliest Retained Instant :" + metadata.getEarliestCommitToRetain() - + " cleanerElapsedMs" + durationMs); + + HoodieCleanMetadata metadata = null; + HoodieTable table = createTable(config, hadoopConf); + if (config.allowMultipleCleans() || !table.getActiveTimeline().getCleanerTimeline().filterInflightsAndRequested().firstInstant().isPresent()) { + LOG.info("Cleaner started"); + // proceed only if multiple clean schedules are enabled or if there are no pending cleans. + if (scheduleInline) { + scheduleTableServiceInternal(cleanInstantTime, Option.empty(), TableServiceType.CLEAN); + table.getMetaClient().reloadActiveTimeline(); + } + + metadata = table.clean(context, cleanInstantTime, skipLocking); + if (timerContext != null && metadata != null) { + long durationMs = metrics.getDurationInMs(timerContext.stop()); + metrics.updateCleanMetrics(durationMs, metadata.getTotalFilesDeleted()); + LOG.info("Cleaned " + metadata.getTotalFilesDeleted() + " files" + + " Earliest Retained Instant :" + metadata.getEarliestCommitToRetain() + + " cleanerElapsedMs" + durationMs); + } } return metadata; } @@ -760,10 +813,13 @@ public HoodieCleanMetadata clean(boolean skipLocking) { * @param table table to commit on. */ protected void archive(HoodieTable table) { + if (!tableServicesEnabled(config)) { + return; + } try { // We cannot have unbounded commit files. Archive commits if we have to archive - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(config, table); - archiveLog.archiveIfRequired(context); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(config, table); + archiver.archiveIfRequired(context); } catch (IOException ioe) { throw new HoodieIOException("Failed to archive", ioe); } @@ -861,7 +917,7 @@ public boolean scheduleCompactionAtInstant(String instantTime, Option compact(String compactionInstantTime) { return compact(compactionInstantTime, config.shouldAutoCommit()); } @@ -869,17 +925,16 @@ public O compact(String compactionInstantTime) { * Commit a compaction operation. Allow passing additional meta-data to be stored in commit instant file. 
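The reworked `clean()` above also guards against piling up clean schedules: a new clean is only planned when multiple cleans are explicitly allowed or when nothing is already pending. That guard, extracted into a standalone helper for clarity (illustrative, using the timeline calls from the hunk above):

```java
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.HoodieTable;

// Returns true when a clean may be scheduled: either multiple concurrent
// clean schedules are allowed, or no clean action is currently in the
// requested or inflight state on the active timeline.
class CleanGuardSketch {
  static boolean canScheduleClean(HoodieWriteConfig config, HoodieTable table) {
    return config.allowMultipleCleans()
        || !table.getActiveTimeline().getCleanerTimeline()
            .filterInflightsAndRequested().firstInstant().isPresent();
  }
}
```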
* * @param compactionInstantTime Compaction Instant Time - * @param writeStatuses Collection of WriteStatus to inspect errors and counts + * @param metadata All the metadata that gets stored along with a commit * @param extraMetadata Extra Metadata to be stored */ - public abstract void commitCompaction(String compactionInstantTime, O writeStatuses, - Option> extraMetadata) throws IOException; + public abstract void commitCompaction(String compactionInstantTime, HoodieCommitMetadata metadata, + Option> extraMetadata); /** * Commit Compaction and track metrics. */ - protected abstract void completeCompaction(HoodieCommitMetadata metadata, O writeStatuses, - HoodieTable table, String compactionCommitTime); + protected abstract void completeCompaction(HoodieCommitMetadata metadata, HoodieTable table, String compactionCommitTime); /** * Get inflight timeline excluding compaction and clustering. @@ -1001,13 +1056,14 @@ protected List getInstantsToRollback(HoodieTableMetaClient metaClient, H * @param compactionInstantTime Compaction Instant Time * @return Collection of Write Status */ - protected abstract O compact(String compactionInstantTime, boolean shouldComplete); + protected abstract HoodieWriteMetadata compact(String compactionInstantTime, boolean shouldComplete); /** * Performs a compaction operation on a table, serially before or after an insert/upsert action. + * Scheduling and execution are done inline. */ - protected Option inlineCompact(Option> extraMetadata) { - Option compactionInstantTimeOpt = scheduleCompaction(extraMetadata); + protected Option inlineCompaction(Option> extraMetadata) { + Option compactionInstantTimeOpt = inlineScheduleCompaction(extraMetadata); compactionInstantTimeOpt.ifPresent(compactInstantTime -> { // inline compaction should auto commit as the user is never given control compact(compactInstantTime, true); @@ -1015,6 +1071,15 @@ protected Option inlineCompact(Option> extraMetadata return compactionInstantTimeOpt; } + /** + * Schedules compaction inline. + * @param extraMetadata extra metadata to be used. + * @return compaction instant if scheduled. + */ + protected Option inlineScheduleCompaction(Option> extraMetadata) { + return scheduleCompaction(extraMetadata); + } + /** * Schedules a new clustering instant. * @param extraMetadata Extra Metadata to be stored @@ -1093,7 +1158,13 @@ public Option scheduleTableService(String instantTime, Option scheduleTableServiceInternal(String instantTime, Option> extraMetadata, TableServiceType tableServiceType) { + if (!tableServicesEnabled(config)) { + return Option.empty(); + } switch (tableServiceType) { + case ARCHIVE: + LOG.info("Scheduling archiving is not supported. Skipping."); + return Option.empty(); case CLUSTER: LOG.info("Scheduling clustering at instant time :" + instantTime); Option clusteringPlan = createTable(config, hadoopConf, config.isMetadataTableEnabled()) @@ -1116,9 +1187,10 @@ private Option scheduleTableServiceInternal(String instantTime, Option inlineCluster(Option> extraMetadata) { - Option clusteringInstantOpt = scheduleClustering(extraMetadata); + protected Option inlineClustering(Option> extraMetadata) { + Option clusteringInstantOpt = inlineScheduleClustering(extraMetadata); clusteringInstantOpt.ifPresent(clusteringInstant -> { // inline cluster should auto commit as the user is never given control cluster(clusteringInstant, true); @@ -1126,6 +1198,15 @@ protected Option inlineCluster(Option> extraMetadata return clusteringInstantOpt; } + /** + * Schedules clustering inline.
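With scheduling split out into `inlineScheduleCompaction` and `inlineScheduleClustering`, a writer can plan table services inline after each commit while deferring execution to a separate job. A sketch of the corresponding writer properties, using the config constants referenced in these hunks (the combination shown is illustrative):

```java
import java.util.Properties;

import org.apache.hudi.config.HoodieClusteringConfig;
import org.apache.hudi.config.HoodieCompactionConfig;

// "Schedule only" mode: plans are written to the timeline inline, but the
// pending instants are left for an async compaction/clustering job to execute.
class ScheduleInlineConfigSketch {
  static Properties scheduleOnlyProps() {
    Properties props = new Properties();
    props.setProperty(HoodieCompactionConfig.INLINE_COMPACT.key(), "false");
    props.setProperty(HoodieCompactionConfig.SCHEDULE_INLINE_COMPACT.key(), "true");
    props.setProperty(HoodieClusteringConfig.INLINE_CLUSTERING.key(), "false");
    props.setProperty(HoodieClusteringConfig.SCHEDULE_INLINE_CLUSTERING.key(), "true");
    return props;
  }
}
```

Note that, per the hunks above, inline scheduling only proceeds when no compaction or clustering instant is already pending on the timeline.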
+ * @param extraMetadata extra metadata to use. + * @return clustering instant if scheduled. + */ + protected Option inlineScheduleClustering(Option> extraMetadata) { + return scheduleClustering(extraMetadata); + } + protected void rollbackInflightClustering(HoodieInstant inflightInstant, HoodieTable table) { String commitTime = HoodieActiveTimeline.createNewInstantTime(); table.scheduleRollback(context, commitTime, inflightInstant, false, config.shouldRollbackUsingMarkers()); @@ -1160,7 +1241,7 @@ public HoodieMetrics getMetrics() { return metrics; } - public HoodieIndex getIndex() { + public HoodieIndex getIndex() { return index; } @@ -1208,7 +1289,8 @@ protected void releaseResources() { @Override public void close() { - // release AsyncCleanerService + AsyncArchiveService.forceShutdown(asyncArchiveService); + asyncArchiveService = null; AsyncCleanerService.forceShutdown(asyncCleanerService); asyncCleanerService = null; // Stop timeline-server if running diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/CompactionAdminClient.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/CompactionAdminClient.java index 1c869e46f1cbf..40e8f85a3ac70 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/CompactionAdminClient.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/CompactionAdminClient.java @@ -61,7 +61,7 @@ /** * Client to perform admin operations related to compaction. */ -public class CompactionAdminClient extends AbstractHoodieClient { +public class CompactionAdminClient extends BaseHoodieClient { private static final Logger LOG = LogManager.getLogger(CompactionAdminClient.class); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTimelineArchiveLog.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/HoodieTimelineArchiver.java similarity index 97% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTimelineArchiveLog.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/HoodieTimelineArchiver.java index 138e40a90c6e0..15401c0292e14 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTimelineArchiveLog.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/HoodieTimelineArchiver.java @@ -7,19 +7,18 @@ * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
*/ -package org.apache.hudi.table; +package org.apache.hudi.client; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; import org.apache.hudi.avro.model.HoodieArchivedMetaEntry; import org.apache.hudi.avro.model.HoodieMergeArchiveFilePlan; import org.apache.hudi.client.utils.MetadataConversionUtils; @@ -52,11 +51,14 @@ import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.metadata.HoodieTableMetadata; +import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.marker.WriteMarkers; import org.apache.hudi.table.marker.WriteMarkersFactory; import org.apache.avro.Schema; import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -79,9 +81,9 @@ /** * Archiver to bound the growth of files under .hoodie meta path. */ -public class HoodieTimelineArchiveLog { +public class HoodieTimelineArchiver { - private static final Logger LOG = LogManager.getLogger(HoodieTimelineArchiveLog.class); + private static final Logger LOG = LogManager.getLogger(HoodieTimelineArchiver.class); private final Path archiveFilePath; private final HoodieWriteConfig config; @@ -91,7 +93,7 @@ public class HoodieTimelineArchiveLog { private final HoodieTable table; private final HoodieTableMetaClient metaClient; - public HoodieTimelineArchiveLog(HoodieWriteConfig config, HoodieTable table) { + public HoodieTimelineArchiver(HoodieWriteConfig config, HoodieTable table) { this.config = config; this.table = table; this.metaClient = table.getMetaClient(); @@ -319,8 +321,7 @@ public void mergeArchiveFiles(List compactCandidate) throws IOExcept // Read the avro blocks while (reader.hasNext()) { HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next(); - List recordsPerFile = blk.getRecords(); - records.addAll(recordsPerFile); + blk.getRecordItr().forEachRemaining(records::add); if (records.size() >= this.config.getCommitArchivalBatchSize()) { writeToFile(wrapperSchema, records); } @@ -427,7 +428,7 @@ private Stream getInstantsToArchive() { .collect(Collectors.groupingBy(i -> Pair.of(i.getTimestamp(), HoodieInstant.getComparableAction(i.getAction())))); - // If metadata table is enabled, do not archive instants which are more recent that the last compaction on the + // If metadata table is enabled, do not archive instants which are more recent than the last compaction on the // metadata table. if (config.isMetadataTableEnabled()) { try (HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(table.getContext(), config.getMetadataConfig(), @@ -445,7 +446,7 @@ private Stream getInstantsToArchive() { throw new HoodieException("Error limiting instant archival based on metadata table", e); } } - + return instants.flatMap(hoodieInstant -> groupByTsAction.get(Pair.of(hoodieInstant.getTimestamp(), HoodieInstant.getComparableAction(hoodieInstant.getAction()))).stream()); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/RunsTableService.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/RunsTableService.java new file mode 100644 index 0000000000000..64e540568e8dc --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/RunsTableService.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client; + +import org.apache.hudi.config.HoodieWriteConfig; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +public interface RunsTableService { + + Logger LOG = LogManager.getLogger(RunsTableService.class); + + default boolean tableServicesEnabled(HoodieWriteConfig config) { + boolean enabled = config.areTableServicesEnabled(); + if (!enabled) { + LOG.warn(String.format("Table services are disabled. Set `%s` to enable.", HoodieWriteConfig.TABLE_SERVICES_ENABLED)); + } + return enabled; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/embedded/EmbeddedTimelineService.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/embedded/EmbeddedTimelineService.java index 20f9b75a910a5..72f8e29c9fa8e 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/embedded/EmbeddedTimelineService.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/embedded/EmbeddedTimelineService.java @@ -112,8 +112,12 @@ public FileSystemViewStorageConfig getRemoteFileSystemViewConfig() { FileSystemViewStorageType viewStorageType = writeConfig.getClientSpecifiedViewStorageConfig() .shouldEnableBackupForRemoteFileSystemView() ? 
FileSystemViewStorageType.REMOTE_FIRST : FileSystemViewStorageType.REMOTE_ONLY; - return FileSystemViewStorageConfig.newBuilder().withStorageType(viewStorageType) - .withRemoteServerHost(hostAddr).withRemoteServerPort(serverPort).build(); + return FileSystemViewStorageConfig.newBuilder() + .withStorageType(viewStorageType) + .withRemoteServerHost(hostAddr) + .withRemoteServerPort(serverPort) + .withRemoteTimelineClientTimeoutSecs(writeConfig.getClientSpecifiedViewStorageConfig().getRemoteTimelineClientTimeoutSecs()) + .build(); } public FileSystemViewManager getViewManager() { diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/InProcessLockProvider.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/InProcessLockProvider.java index 426d1cfaf4020..9e5a2379c4c93 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/InProcessLockProvider.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/InProcessLockProvider.java @@ -19,13 +19,14 @@ package org.apache.hudi.client.transaction.lock; -import org.apache.hadoop.conf.Configuration; import org.apache.hudi.common.config.LockConfiguration; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.lock.LockProvider; import org.apache.hudi.common.lock.LockState; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.exception.HoodieLockException; + +import org.apache.hadoop.conf.Configuration; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.jetbrains.annotations.NotNull; @@ -92,7 +93,11 @@ public boolean tryLock(long time, @NotNull TimeUnit unit) { public void unlock() { LOG.info(getLogMessage(LockState.RELEASING)); try { - LOCK.writeLock().unlock(); + if (LOCK.isWriteLockedByCurrentThread()) { + LOCK.writeLock().unlock(); + } else { + LOG.warn("Cannot unlock because the current thread does not hold the lock."); + } } catch (Exception e) { throw new HoodieLockException(getLogMessage(LockState.FAILED_TO_RELEASE), e); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/LockManager.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/LockManager.java index 773685980af43..913736cad8a91 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/LockManager.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/transaction/lock/LockManager.java @@ -83,6 +83,10 @@ public void lock() { } } + /** + * Handle the case where the current thread may not be the holder of this lock + * but still calls unlock(). + */ public void unlock() { if (writeConfig.getWriteConcurrencyMode().supportsOptimisticConcurrencyControl()) { getLockProvider().unlock(); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/TransactionUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/TransactionUtils.java index ed2ea457764fb..9d7683128fc8c 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/TransactionUtils.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/utils/TransactionUtils.java @@ -60,9 +60,31 @@ public static Option resolveWriteConflictIfAny( final Option thisCommitMetadata, final HoodieWriteConfig config, Option lastCompletedTxnOwnerInstant)
throws HoodieWriteConflictException { + return resolveWriteConflictIfAny(table, currentTxnOwnerInstant, thisCommitMetadata, config, lastCompletedTxnOwnerInstant, false); + } + + /** + * Resolve any write conflicts when committing data. + * + * @param table the Hoodie table being committed to + * @param currentTxnOwnerInstant instant owned by the current transaction + * @param thisCommitMetadata commit metadata of the current transaction + * @param config the write config + * @param lastCompletedTxnOwnerInstant instant owned by the last completed transaction + * @param reloadActiveTimeline whether to reload the active timeline before fetching candidate instants + * @return the commit metadata of the current transaction, if any + * @throws HoodieWriteConflictException when a write conflict is detected + */ + public static Option resolveWriteConflictIfAny( + final HoodieTable table, + final Option currentTxnOwnerInstant, + final Option thisCommitMetadata, + final HoodieWriteConfig config, + Option lastCompletedTxnOwnerInstant, + boolean reloadActiveTimeline) throws HoodieWriteConflictException { if (config.getWriteConcurrencyMode().supportsOptimisticConcurrencyControl()) { ConflictResolutionStrategy resolutionStrategy = config.getWriteConflictResolutionStrategy(); - Stream instantStream = resolutionStrategy.getCandidateInstants(table.getActiveTimeline(), currentTxnOwnerInstant.get(), lastCompletedTxnOwnerInstant); + Stream instantStream = resolutionStrategy.getCandidateInstants(reloadActiveTimeline + ? table.getMetaClient().reloadActiveTimeline() : table.getActiveTimeline(), currentTxnOwnerInstant.get(), lastCompletedTxnOwnerInstant); final ConcurrentOperation thisOperation = new ConcurrentOperation(currentTxnOwnerInstant.get(), thisCommitMetadata.orElse(new HoodieCommitMetadata())); instantStream.forEach(instant -> { try { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieFileSliceReader.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/common/table/log/HoodieFileSliceReader.java similarity index 85% rename from hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieFileSliceReader.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/common/table/log/HoodieFileSliceReader.java index a786e8305bc27..a042255cdcb1a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieFileSliceReader.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/common/table/log/HoodieFileSliceReader.java @@ -7,13 +7,14 @@ * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
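The `InProcessLockProvider` fix above boils down to a standard `ReentrantReadWriteLock` idiom: only the thread that actually holds the write lock may release it, otherwise `unlock()` throws `IllegalMonitorStateException`. A standalone, runnable sketch of the guard:

```java
import java.util.concurrent.locks.ReentrantReadWriteLock;

// Guarded unlock: releasing a write lock from a non-owning thread would throw
// IllegalMonitorStateException, so check ownership first and warn otherwise.
class GuardedUnlockSketch {
  private static final ReentrantReadWriteLock LOCK = new ReentrantReadWriteLock();

  static void unlockSafely() {
    if (LOCK.isWriteLockedByCurrentThread()) {
      LOCK.writeLock().unlock();
    } else {
      System.err.println("Current thread does not hold the lock; skipping unlock.");
    }
  }
}
```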
*/ package org.apache.hudi.common.table.log; @@ -23,6 +24,7 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.SpillableMapUtils; import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodiePayloadConfig; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.io.storage.HoodieFileReader; @@ -53,10 +55,11 @@ public static HoodieFileSliceReader getFileSliceReader( return new HoodieFileSliceReader(scanner.iterator()); } else { Iterable> iterable = () -> scanner.iterator(); + HoodiePayloadConfig payloadConfig = HoodiePayloadConfig.newBuilder().withPayloadOrderingField(preCombineField).build(); return new HoodieFileSliceReader(StreamSupport.stream(iterable.spliterator(), false) .map(e -> { try { - GenericRecord record = (GenericRecord) e.getData().getInsertValue(schema).get(); + GenericRecord record = (GenericRecord) e.getData().getInsertValue(schema, payloadConfig.getProps()).get(); return transform(record, scanner, payloadClass, preCombineField, simpleKeyGenFieldsOpt); } catch (IOException io) { throw new HoodieIOException("Error while creating reader for file slice with no base file.", io); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java index 057b4a6f61299..41b1812c08151 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java @@ -25,6 +25,7 @@ import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.engine.EngineType; import org.apache.hudi.common.util.TypeUtils; +import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieNotSupportedException; import org.apache.hudi.table.action.cluster.ClusteringPlanPartitionFilterMode; @@ -59,7 +60,7 @@ public class HoodieClusteringConfig extends HoodieConfig { "hoodie.clustering.plan.partition.filter.mode"; // Any Space-filling curves optimize(z-order/hilbert) params can be saved with this prefix - public static final String LAYOUT_OPTIMIZE_PARAM_PREFIX = "hoodie.layout.optimize."; + private static final String LAYOUT_OPTIMIZE_PARAM_PREFIX = "hoodie.layout.optimize."; public static final ConfigProperty DAYBASED_LOOKBACK_PARTITIONS = ConfigProperty .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "daybased.lookback.partitions") @@ -83,7 +84,7 @@ public class HoodieClusteringConfig extends HoodieConfig { public static final ConfigProperty PLAN_STRATEGY_SMALL_FILE_LIMIT = ConfigProperty .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "small.file.limit") - .defaultValue(String.valueOf(600 * 1024 * 1024L)) + .defaultValue(String.valueOf(300 * 1024 * 1024L)) .sinceVersion("0.7.0") .withDocumentation("Files smaller than the size specified here are candidates for clustering"); @@ -113,7 +114,8 @@ public class HoodieClusteringConfig extends HoodieConfig { .key("hoodie.clustering.inline") .defaultValue("false") .sinceVersion("0.7.0") - .withDocumentation("Turn on inline clustering - clustering will be run after each write operation is complete"); + .withDocumentation("Turn on inline clustering - clustering will be run after each write operation is complete") + .withAlternatives("hoodie.datasource.clustering.inline.enable"); public static final ConfigProperty INLINE_CLUSTERING_MAX_COMMITS 
= ConfigProperty .key("hoodie.clustering.inline.max.commits") @@ -177,11 +179,22 @@ .withDocumentation("Determines how to handle updates, deletes to file groups that are under clustering." + " Default strategy just rejects the update"); + public static final ConfigProperty SCHEDULE_INLINE_CLUSTERING = ConfigProperty + .key("hoodie.clustering.schedule.inline") + .defaultValue("false") + .withDocumentation("When set to true, the clustering service will attempt to schedule clustering inline after each write. Users have to ensure " + + "they have a separate job to run async clustering (execution) for the one scheduled by this writer. Users can choose to set both " + + "`hoodie.clustering.inline` and `hoodie.clustering.schedule.inline` to false and have both scheduling and execution triggered by any async process, in which " + + "case `hoodie.clustering.async.enabled` is expected to be set to true. But if `hoodie.clustering.inline` is set to false, and `hoodie.clustering.schedule.inline` " + + "is set to true, regular writers will schedule clustering inline, but users are expected to trigger an async job for execution. If `hoodie.clustering.inline` is set " + + "to true, regular writers will do both scheduling and execution inline for clustering"); + public static final ConfigProperty ASYNC_CLUSTERING_ENABLE = ConfigProperty .key("hoodie.clustering.async.enabled") .defaultValue("false") .sinceVersion("0.7.0") - .withDocumentation("Enable running of clustering service, asynchronously as inserts happen on the table."); + .withDocumentation("Enable running of clustering service, asynchronously as inserts happen on the table.") + .withAlternatives("hoodie.datasource.clustering.async.enable"); public static final ConfigProperty PRESERVE_COMMIT_METADATA = ConfigProperty .key("hoodie.clustering.preserve.commit.metadata") @@ -190,63 +203,88 @@ .withDocumentation("When rewriting data, preserves existing hoodie_commit_time"); /** - * Using space-filling curves to optimize the layout of table to boost query performance. - * The table data which sorted by space-filling curve has better aggregation; - * combine with min-max filtering, it can achieve good performance improvement. - * - * Notice: - * when we use this feature, we need specify the sort columns. - * The more columns involved in sorting, the worse the aggregation, and the smaller the query performance improvement. - * Choose the filter columns which commonly used in query sql as sort columns. - * It is recommend that 2 ~ 4 columns participate in sorting. + * @deprecated this setting has no effect. Please refer to clustering configuration, as well as + * {@link #LAYOUT_OPTIMIZE_STRATEGY} config to enable advanced record layout optimization strategies */ public static final ConfigProperty LAYOUT_OPTIMIZE_ENABLE = ConfigProperty .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "enable") .defaultValue(false) .sinceVersion("0.10.0") - .withDocumentation("Enable use z-ordering/space-filling curves to optimize the layout of table to boost query performance. " + "This parameter takes precedence over clustering strategy set using " + EXECUTION_STRATEGY_CLASS_NAME.key()); + .deprecatedAfter("0.11.0") + .withDocumentation("This setting has no effect.
Please refer to clustering configuration, as well as " + + "LAYOUT_OPTIMIZE_STRATEGY config to enable advanced record layout optimization strategies"); - public static final ConfigProperty LAYOUT_OPTIMIZE_STRATEGY = ConfigProperty + /** + * Determines the ordering strategy for records layout optimization. + * Currently, the following strategies are supported: + *
+ * <ul>
+ *   <li>Linear: simply orders records lexicographically</li>
+ *   <li>Z-order: orders records along Z-order spatial-curve</li>
+ *   <li>Hilbert: orders records along Hilbert's spatial-curve</li>
+ * </ul>
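+ * <p>A minimal sketch of selecting a strategy through the builder method exposed further below
+ * in this class (the {@code newBuilder()} entry point is assumed from Hudi's config classes):
+ * <pre>{@code
+ *   HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder()
+ *       .withDataOptimizeStrategy("z-order") // "linear" (default) and "hilbert" also supported
+ *       .build();
+ * }</pre>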
+ * + * NOTE: "z-order", "hilbert" strategies may consume considerably more compute than "linear". + * Make sure to perform small-scale local testing for your dataset before applying globally. + */ + public static final ConfigProperty LAYOUT_OPTIMIZE_STRATEGY = ConfigProperty .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "strategy") - .defaultValue("z-order") + .defaultValue("linear") .sinceVersion("0.10.0") - .withDocumentation("Type of layout optimization to be applied, current only supports `z-order` and `hilbert` curves."); + .withDocumentation("Determines the ordering strategy used in records layout optimization. " + + "Currently \"linear\", \"z-order\" and \"hilbert\" strategies are supported."); /** - * There exists two method to build z-curve. - * one is directly mapping sort cols to z-value to build z-curve; - * we can find this method in Amazon DynamoDB https://aws.amazon.com/cn/blogs/database/tag/z-order/ - * the other one is Boundary-based Interleaved Index method which we proposed. simply call it sample method. - * Refer to rfc-28 for specific algorithm flow. - * Boundary-based Interleaved Index method has better generalization, but the build speed is slower than direct method. + * NOTE: This setting only has effect if {@link #LAYOUT_OPTIMIZE_STRATEGY} value is set to + * either "z-order" or "hilbert" (i.e. leveraging space-filling curves). + * + * Currently, two methods to order records along the curve are supported, "direct" and "sample": + *
+ * <ul>
+ *   <li>Direct: entails that spatial curve will be built in full, "filling in" all of the individual
+ *   points corresponding to each individual record</li>
+ *   <li>Sample: leverages boundary-based interleaved index method (described in more detail in
+ *   Amazon DynamoDB blog [1])</li>
+ * </ul>
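+ * <p>A minimal sketch of opting into the sampling method via the builder methods shown further
+ * below in this class (the sample size given is just the default, for illustration):
+ * <pre>{@code
+ *   HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder()
+ *       .withDataOptimizeBuildCurveStrategy("sample")
+ *       .withDataOptimizeBuildCurveSampleNumber(200000)
+ *       .build();
+ * }</pre>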
+ * + * NOTE: The boundary-based interleaved index method has better generalization, + * but is slower than the direct method. + * + * Please refer to RFC-28 for specific elaboration on both flows. + * + * [1] https://aws.amazon.com/cn/blogs/database/tag/z-order/ */ - public static final ConfigProperty LAYOUT_OPTIMIZE_CURVE_BUILD_METHOD = ConfigProperty + public static final ConfigProperty LAYOUT_OPTIMIZE_SPATIAL_CURVE_BUILD_METHOD = ConfigProperty .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "curve.build.method") .defaultValue("direct") .sinceVersion("0.10.0") - .withDocumentation("Controls how data is sampled to build the space filling curves. two methods: `direct`,`sample`." - + "The direct method is faster than the sampling, however sample method would produce a better data layout."); + .withDocumentation("Controls how data is sampled to build the space-filling curves. " + + "Two methods: \"direct\", \"sample\". The direct method is faster than the sample method, " + + "however the sample method would produce a better data layout."); + /** - * Doing sample for table data is the first step in Boundary-based Interleaved Index method. - * larger sample number means better optimize result, but more memory consumption + * NOTE: This setting only has effect if {@link #LAYOUT_OPTIMIZE_SPATIAL_CURVE_BUILD_METHOD} value + * is set to "sample". + * + * Determines target sample size used by the Boundary-based Interleaved Index method. + * Larger sample size entails better layout optimization outcomes, at the expense of higher memory + * footprint. */ - public static final ConfigProperty LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE = ConfigProperty + public static final ConfigProperty LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE = ConfigProperty .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "build.curve.sample.size") .defaultValue("200000") .sinceVersion("0.10.0") - .withDocumentation("when setting" + LAYOUT_OPTIMIZE_CURVE_BUILD_METHOD.key() + " to `sample`, the amount of sampling to be done." - + "Large sample size leads to better results, at the expense of more memory usage."); + .withDocumentation("Determines target sample size used by the Boundary-based Interleaved Index method " + + "of building the space-filling curve. Larger sample size entails better layout optimization outcomes, " + + "at the expense of higher memory footprint."); /** - * The best way to use Z-order/Space-filling curves is to cooperate with Data-Skipping - * with data-skipping query engine can greatly reduce the number of table files to be read.
- * otherwise query engine can only do row-group skipping for files (parquet/orc) + * @deprecated this setting has no effect */ public static final ConfigProperty LAYOUT_OPTIMIZE_DATA_SKIPPING_ENABLE = ConfigProperty .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "data.skipping.enable") .defaultValue(true) .sinceVersion("0.10.0") + .deprecatedAfter("0.11.0") .withDocumentation("Enable data skipping by collecting statistics once layout optimization is complete."); public static final ConfigProperty ROLLBACK_PENDING_CLUSTERING_ON_CONFLICT = ConfigProperty @@ -480,6 +518,11 @@ public Builder withInlineClustering(Boolean inlineClustering) { return this; } + public Builder withScheduleInlineClustering(Boolean scheduleInlineClustering) { + clusteringConfig.setValue(SCHEDULE_INLINE_CLUSTERING, String.valueOf(scheduleInlineClustering)); + return this; + } + public Builder withInlineClusteringNumCommits(int numCommits) { clusteringConfig.setValue(INLINE_CLUSTERING_MAX_COMMITS, String.valueOf(numCommits)); return this; @@ -516,18 +559,13 @@ public Builder withRollbackPendingClustering(Boolean rollbackPendingClustering) return this; } - public Builder withSpaceFillingCurveDataOptimizeEnable(Boolean enable) { - clusteringConfig.setValue(LAYOUT_OPTIMIZE_ENABLE, String.valueOf(enable)); - return this; - } - public Builder withDataOptimizeStrategy(String strategy) { clusteringConfig.setValue(LAYOUT_OPTIMIZE_STRATEGY, strategy); return this; } public Builder withDataOptimizeBuildCurveStrategy(String method) { - clusteringConfig.setValue(LAYOUT_OPTIMIZE_CURVE_BUILD_METHOD, method); + clusteringConfig.setValue(LAYOUT_OPTIMIZE_SPATIAL_CURVE_BUILD_METHOD, method); return this; } @@ -536,17 +574,18 @@ public Builder withDataOptimizeBuildCurveSampleNumber(int sampleNumber) { return this; } - public Builder withDataOptimizeDataSkippingEnable(boolean dataSkipping) { - clusteringConfig.setValue(LAYOUT_OPTIMIZE_DATA_SKIPPING_ENABLE, String.valueOf(dataSkipping)); - return this; - } - public HoodieClusteringConfig build() { clusteringConfig.setDefaultValue( PLAN_STRATEGY_CLASS_NAME, getDefaultPlanStrategyClassName(engineType)); clusteringConfig.setDefaultValue( EXECUTION_STRATEGY_CLASS_NAME, getDefaultExecutionStrategyClassName(engineType)); clusteringConfig.setDefaults(HoodieClusteringConfig.class.getName()); + + boolean inlineCluster = clusteringConfig.getBoolean(HoodieClusteringConfig.INLINE_CLUSTERING); + boolean inlineClusterSchedule = clusteringConfig.getBoolean(HoodieClusteringConfig.SCHEDULE_INLINE_CLUSTERING); + ValidationUtils.checkArgument(!(inlineCluster && inlineClusterSchedule), String.format("Only one of inline clustering (%s) and " + + "schedule inline clustering (%s) can be enabled. Both can't be set to true at the same time. %s, %s", HoodieClusteringConfig.INLINE_CLUSTERING.key(), + HoodieClusteringConfig.SCHEDULE_INLINE_CLUSTERING.key(), inlineCluster, inlineClusterSchedule)); return clusteringConfig; } @@ -578,21 +617,21 @@ private String getDefaultExecutionStrategyClassName(EngineType engineType) { /** * Type of a strategy for building Z-order/Hilbert space-filling curves.
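 * <p>Values are resolved from their string form; a minimal sketch:
 * <pre>{@code
 *   SpatialCurveCompositionStrategyType method =
 *       SpatialCurveCompositionStrategyType.fromValue("direct"); // or "sample"
 * }</pre>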
*/ - public enum BuildCurveStrategyType { + public enum SpatialCurveCompositionStrategyType { DIRECT("direct"), SAMPLE("sample"); - private static final Map VALUE_TO_ENUM_MAP = - TypeUtils.getValueToEnumMap(BuildCurveStrategyType.class, e -> e.value); + private static final Map VALUE_TO_ENUM_MAP = + TypeUtils.getValueToEnumMap(SpatialCurveCompositionStrategyType.class, e -> e.value); private final String value; - BuildCurveStrategyType(String value) { + SpatialCurveCompositionStrategyType(String value) { this.value = value; } - public static BuildCurveStrategyType fromValue(String value) { - BuildCurveStrategyType enumValue = VALUE_TO_ENUM_MAP.get(value); + public static SpatialCurveCompositionStrategyType fromValue(String value) { + SpatialCurveCompositionStrategyType enumValue = VALUE_TO_ENUM_MAP.get(value); if (enumValue == null) { throw new HoodieException(String.format("Invalid value (%s)", value)); } @@ -605,6 +644,7 @@ public static BuildCurveStrategyType fromValue(String value) { * Layout optimization strategies such as Z-order/Hilbert space-curves, etc */ public enum LayoutOptimizationStrategy { + LINEAR("linear"), ZORDER("z-order"), HILBERT("hilbert"); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieCompactionConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieCompactionConfig.java index 4d1e197cf8a1e..0aac9308da439 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieCompactionConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieCompactionConfig.java @@ -50,13 +50,6 @@ + "cleaning (reclamation of older/unused file groups/slices).") public class HoodieCompactionConfig extends HoodieConfig { - public static final ConfigProperty AUTO_CLEAN = ConfigProperty - .key("hoodie.clean.automatic") - .defaultValue("true") - .withDocumentation("When enabled, the cleaner table service is invoked immediately after each commit," - + " to delete older file slices. It's recommended to enable this, to ensure metadata and data storage" - + " growth is bounded."); - public static final ConfigProperty AUTO_ARCHIVE = ConfigProperty .key("hoodie.archive.automatic") .defaultValue("true") @@ -64,6 +57,20 @@ public class HoodieCompactionConfig extends HoodieConfig { + " to archive commits if we cross a maximum value of commits." + " It's recommended to enable this, to ensure number of active commits is bounded."); + public static final ConfigProperty ASYNC_ARCHIVE = ConfigProperty + .key("hoodie.archive.async") + .defaultValue("false") + .sinceVersion("0.11.0") + .withDocumentation("Only applies when " + AUTO_ARCHIVE.key() + " is turned on. " + + "When turned on runs archiver async with writing, which can speed up overall write performance."); + + public static final ConfigProperty AUTO_CLEAN = ConfigProperty + .key("hoodie.clean.automatic") + .defaultValue("true") + .withDocumentation("When enabled, the cleaner table service is invoked immediately after each commit," + + " to delete older file slices. It's recommended to enable this, to ensure metadata and data storage" + + " growth is bounded."); + public static final ConfigProperty ASYNC_CLEAN = ConfigProperty .key("hoodie.clean.async") .defaultValue("false") @@ -76,6 +83,12 @@ public class HoodieCompactionConfig extends HoodieConfig { .withDocumentation("Number of commits to retain, without cleaning. This will be retained for num_of_commits * time_between_commits " + "(scheduled). 
This also directly translates into how much data retention the table supports for incremental queries."); + public static final ConfigProperty CLEANER_HOURS_RETAINED = ConfigProperty.key("hoodie.cleaner.hours.retained") + .defaultValue("24") + .withDocumentation("Number of hours for which commits need to be retained. This config provides a more flexible option as " + + "compared to the number of commits retained by the cleaning service. Setting this property ensures that all files in a file group, except the latest one, " + + "corresponding to commits with commit times older than the configured number of hours, are cleaned."); + public static final ConfigProperty CLEANER_POLICY = ConfigProperty .key("hoodie.cleaner.policy") .defaultValue(HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name()) @@ -90,6 +103,16 @@ .withDocumentation("When set to true, compaction service is triggered after each write. While being " + " simpler operationally, this adds extra latency on the write path."); + public static final ConfigProperty SCHEDULE_INLINE_COMPACT = ConfigProperty + .key("hoodie.compact.schedule.inline") + .defaultValue("false") + .withDocumentation("When set to true, the compaction service will attempt to schedule compaction inline after each write. Users have to ensure " + + "they have a separate job to run async compaction (execution) for the one scheduled by this writer. Users can choose to set both " + + "`hoodie.compact.inline` and `hoodie.compact.schedule.inline` to false and have both scheduling and execution triggered by any async process. " + + "But if `hoodie.compact.inline` is set to false, and `hoodie.compact.schedule.inline` is set to true, regular writers will schedule compaction inline, " + + "but users are expected to trigger an async job for execution. If `hoodie.compact.inline` is set to true, regular writers will do both scheduling and " + + "execution inline for compaction"); + public static final ConfigProperty INLINE_COMPACT_NUM_DELTA_COMMITS = ConfigProperty .key("hoodie.compact.inline.max.delta.commits") .defaultValue("5") @@ -156,7 +179,8 @@ public class HoodieCompactionConfig extends HoodieConfig { .defaultValue(String.valueOf(104857600)) .withDocumentation("During upsert operation, we opportunistically expand existing small files on storage, instead of writing" + " new files, to keep number of files to an optimum. This config sets the file size limit below which a file on storage " - + " becomes a candidate to be selected as such a `small file`. By default, treat any file <= 100MB as a small file."); + + " becomes a candidate to be selected as such a `small file`. By default, treat any file <= 100MB as a small file." + + " Also note that if this is set to <= 0, Hudi will not try to expand small files and will directly write new files"); public static final ConfigProperty RECORD_SIZE_ESTIMATION_THRESHOLD = ConfigProperty .key("hoodie.record.size.estimation.threshold") @@ -200,7 +224,7 @@ public class HoodieCompactionConfig extends HoodieConfig { public static final ConfigProperty COMPACTION_LAZY_BLOCK_READ_ENABLE = ConfigProperty .key("hoodie.compaction.lazy.block.read") - .defaultValue("false") + .defaultValue("true") .withDocumentation("When merging the delta log files, this config helps to choose whether the log blocks " + "should be read lazily or not.
Choose true to use lazy block reading (low memory usage, but incurs seeks to each block" + " header) or false for immediate block read (higher memory usage)"); @@ -254,6 +278,13 @@ .withDocumentation("The average record size. If not explicitly specified, hudi will compute the " + "record size estimate compute dynamically based on commit metadata. " + " This is critical in computing the insert parallelism and bin-packing inserts into small files."); + + public static final ConfigProperty ALLOW_MULTIPLE_CLEANS = ConfigProperty + .key("hoodie.clean.allow.multiple") + .defaultValue(true) + .sinceVersion("0.11.0") + .withDocumentation("Allows scheduling/executing multiple cleans by enabling this config. If users prefer to strictly ensure clean requests are mutually exclusive, " + + "i.e. a 2nd clean will not be scheduled if another clean is not yet completed (to avoid repeat cleaning of the same files), they might want to disable this config."); public static final ConfigProperty ARCHIVE_MERGE_FILES_BATCH_SIZE = ConfigProperty .key("hoodie.archive.merge.files.batch.size") @@ -512,6 +543,16 @@ public Builder fromProperties(Properties props) { return this; } + public Builder withAutoArchive(Boolean autoArchive) { + compactionConfig.setValue(AUTO_ARCHIVE, String.valueOf(autoArchive)); + return this; + } + + public Builder withAsyncArchive(Boolean asyncArchive) { + compactionConfig.setValue(ASYNC_ARCHIVE, String.valueOf(asyncArchive)); + return this; + } + public Builder withAutoClean(Boolean autoClean) { compactionConfig.setValue(AUTO_CLEAN, String.valueOf(autoClean)); return this; @@ -522,11 +563,6 @@ public Builder withAsyncClean(Boolean asyncClean) { return this; } - public Builder withAutoArchive(Boolean autoArchive) { - compactionConfig.setValue(AUTO_ARCHIVE, String.valueOf(autoArchive)); - return this; - } - public Builder withIncrementalCleaningMode(Boolean incrementalCleaningMode) { compactionConfig.setValue(CLEANER_INCREMENTAL_MODE_ENABLE, String.valueOf(incrementalCleaningMode)); return this; @@ -537,6 +573,11 @@ public Builder withInlineCompaction(Boolean inlineCompaction) { return this; } + public Builder withScheduleInlineCompaction(Boolean scheduleInlineCompaction) { + compactionConfig.setValue(SCHEDULE_INLINE_COMPACT, String.valueOf(scheduleInlineCompaction)); + return this; + } + public Builder withInlineCompactionTriggerStrategy(CompactionTriggerStrategy compactionTriggerStrategy) { compactionConfig.setValue(INLINE_COMPACT_TRIGGER_STRATEGY, compactionTriggerStrategy.name()); return this; @@ -557,6 +598,11 @@ public Builder retainCommits(int commitsRetained) { return this; } + public Builder cleanerNumHoursRetained(int cleanerHoursRetained) { + compactionConfig.setValue(CLEANER_HOURS_RETAINED, String.valueOf(cleanerHoursRetained)); + return this; + } + public Builder archiveCommitsWith(int minToKeep, int maxToKeep) { compactionConfig.setValue(MIN_COMMITS_TO_KEEP, String.valueOf(minToKeep)); compactionConfig.setValue(MAX_COMMITS_TO_KEEP, String.valueOf(maxToKeep)); @@ -603,6 +649,11 @@ public Builder approxRecordSize(int recordSizeEstimate) { return this; } + public Builder allowMultipleCleans(boolean allowMultipleCleanSchedules) { + compactionConfig.setValue(ALLOW_MULTIPLE_CLEANS, String.valueOf(allowMultipleCleanSchedules)); + return this; + } + public Builder withCleanerParallelism(int cleanerParallelism) { compactionConfig.setValue(CLEANER_PARALLELISM_VALUE, String.valueOf(cleanerParallelism)); return this; @@ -700,6
+751,12 @@ public HoodieCompactionConfig build() { + "missing data from few instants.", HoodieCompactionConfig.MIN_COMMITS_TO_KEEP.key(), minInstantsToKeep, HoodieCompactionConfig.CLEANER_COMMITS_RETAINED.key(), cleanerCommitsRetained)); + + boolean inlineCompact = compactionConfig.getBoolean(HoodieCompactionConfig.INLINE_COMPACT); + boolean inlineCompactSchedule = compactionConfig.getBoolean(HoodieCompactionConfig.SCHEDULE_INLINE_COMPACT); + ValidationUtils.checkArgument(!(inlineCompact && inlineCompactSchedule), String.format("Only one of inline compaction (%s) and " + + "schedule inline compaction (%s) can be enabled. Both can't be set to true at the same time. %s, %s", HoodieCompactionConfig.INLINE_COMPACT.key(), + HoodieCompactionConfig.SCHEDULE_INLINE_COMPACT.key(), inlineCompact, inlineCompactSchedule)); return compactionConfig; } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieIndexConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieIndexConfig.java index e156310c736b0..f82f14d5a9c64 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieIndexConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieIndexConfig.java @@ -152,7 +152,7 @@ public class HoodieIndexConfig extends HoodieConfig { public static final ConfigProperty SIMPLE_INDEX_PARALLELISM = ConfigProperty .key("hoodie.simple.index.parallelism") - .defaultValue("50") + .defaultValue("100") .withDocumentation("Only applies if index type is SIMPLE. " + "This is the amount of parallelism for index lookup, which involves a Spark Shuffle"); @@ -568,7 +568,7 @@ public HoodieIndexConfig build() { private String getDefaultIndexType(EngineType engineType) { switch (engineType) { case SPARK: - return HoodieIndex.IndexType.BLOOM.name(); + return HoodieIndex.IndexType.SIMPLE.name(); case FLINK: case JAVA: return HoodieIndex.IndexType.INMEMORY.name(); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieStorageConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieStorageConfig.java index 42689ec18e948..6447a039cc069 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieStorageConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieStorageConfig.java @@ -83,14 +83,17 @@ public class HoodieStorageConfig extends HoodieConfig { .withDocumentation("Lower values increase the size of metadata tracked within HFile, but can offer potentially " + "faster lookup times."); - // used to size log files + public static final ConfigProperty LOGFILE_DATA_BLOCK_FORMAT = ConfigProperty + .key("hoodie.logfile.data.block.format") + .noDefaultValue() + .withDocumentation("Format of the data block within delta logs. The following formats are currently supported: \"avro\", \"hfile\", \"parquet\"") + public static final ConfigProperty LOGFILE_MAX_SIZE = ConfigProperty .key("hoodie.logfile.max.size") .defaultValue(String.valueOf(1024 * 1024 * 1024)) // 1 GB .withDocumentation("LogFile max size.
This is the maximum size allowed for a log file " + "before it is rolled over to the next version."); - // used to size data blocks in log file public static final ConfigProperty LOGFILE_DATA_BLOCK_MAX_SIZE = ConfigProperty .key("hoodie.logfile.data.block.max.size") .defaultValue(String.valueOf(256 * 1024 * 1024)) @@ -124,7 +127,7 @@ public class HoodieStorageConfig extends HoodieConfig { public static final ConfigProperty PARQUET_OUTPUT_TIMESTAMP_TYPE = ConfigProperty .key("hoodie.parquet.outputtimestamptype") - .defaultValue("TIMESTAMP_MILLIS") + .defaultValue("TIMESTAMP_MICROS") .withDocumentation("Sets spark.sql.parquet.outputTimestampType. Parquet timestamp type to use when Spark writes data to Parquet files."); public static final ConfigProperty HFILE_COMPRESSION_ALGORITHM_NAME = ConfigProperty diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java index 5896ac49ea69d..b7b410817b2fd 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java @@ -31,15 +31,18 @@ import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.engine.EngineType; import org.apache.hudi.common.fs.ConsistencyGuardConfig; +import org.apache.hudi.common.fs.FileSystemRetryConfig; import org.apache.hudi.common.model.HoodieCleaningPolicy; import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; import org.apache.hudi.common.model.WriteConcurrencyMode; import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.common.table.log.block.HoodieLogBlock; import org.apache.hudi.common.table.marker.MarkerType; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.config.metrics.HoodieMetricsConfig; @@ -59,9 +62,9 @@ import org.apache.hudi.table.action.cluster.ClusteringPlanPartitionFilterMode; import org.apache.hudi.table.action.compact.CompactionTriggerStrategy; import org.apache.hudi.table.action.compact.strategy.CompactionStrategy; +import org.apache.hudi.table.storage.HoodieStorageLayout; import org.apache.hadoop.hbase.io.compress.Compression; -import org.apache.hudi.table.storage.HoodieStorageLayout; import org.apache.orc.CompressionKind; import org.apache.parquet.hadoop.metadata.CompressionCodecName; @@ -438,7 +441,20 @@ public class HoodieWriteConfig extends HoodieConfig { .sinceVersion("0.10.0") .withDocumentation("File Id Prefix provider class, that implements `org.apache.hudi.fileid.FileIdPrefixProvider`"); + public static final ConfigProperty TABLE_SERVICES_ENABLED = ConfigProperty + .key("hoodie.table.services.enabled") + .defaultValue(true) + .sinceVersion("0.11.0") + .withDocumentation("Master control to disable all table services including archive, clean, compact, cluster, etc."); + + public static final ConfigProperty RELEASE_RESOURCE_ENABLE = ConfigProperty + .key("hoodie.release.resource.on.completion.enable") + .defaultValue(true) + .sinceVersion("0.11.0") + .withDocumentation("Control to 
enable releasing all persisted RDDs when the Spark job finishes."); + private ConsistencyGuardConfig consistencyGuardConfig; + private FileSystemRetryConfig fileSystemRetryConfig; // Hoodie Write Client transparently rewrites File System View config when embedded mode is enabled // We keep track of original config and rewritten config @@ -832,6 +848,7 @@ protected HoodieWriteConfig(EngineType engineType, Properties props) { newProps.putAll(props); this.engineType = engineType; this.consistencyGuardConfig = ConsistencyGuardConfig.newBuilder().fromProperties(newProps).build(); + this.fileSystemRetryConfig = FileSystemRetryConfig.newBuilder().fromProperties(newProps).build(); this.clientSpecifiedViewStorageConfig = FileSystemViewStorageConfig.newBuilder().fromProperties(newProps).build(); this.viewStorageConfig = clientSpecifiedViewStorageConfig; this.hoodiePayloadConfig = HoodiePayloadConfig.newBuilder().fromProperties(newProps).build(); @@ -1074,6 +1091,10 @@ public int getCleanerCommitsRetained() { return getInt(HoodieCompactionConfig.CLEANER_COMMITS_RETAINED); } + public int getCleanerHoursRetained() { + return getInt(HoodieCompactionConfig.CLEANER_HOURS_RETAINED); + } + public int getMaxCommitsToKeep() { return getInt(HoodieCompactionConfig.MAX_COMMITS_TO_KEEP); } @@ -1102,6 +1123,10 @@ public int getCopyOnWriteRecordSizeEstimate() { return getInt(HoodieCompactionConfig.COPY_ON_WRITE_RECORD_SIZE_ESTIMATE); } + public boolean allowMultipleCleans() { + return getBoolean(HoodieCompactionConfig.ALLOW_MULTIPLE_CLEANS); + } + public boolean shouldAutoTuneInsertSplits() { return getBoolean(HoodieCompactionConfig.COPY_ON_WRITE_AUTO_SPLIT_INSERTS); } @@ -1110,10 +1135,6 @@ public int getCleanerParallelism() { return getInt(HoodieCompactionConfig.CLEANER_PARALLELISM_VALUE); } - public boolean isAutoClean() { - return getBoolean(HoodieCompactionConfig.AUTO_CLEAN); - } - public boolean getArchiveMergeEnable() { return getBoolean(HoodieCompactionConfig.ARCHIVE_MERGE_ENABLE); } @@ -1126,6 +1147,14 @@ public boolean isAutoArchive() { return getBoolean(HoodieCompactionConfig.AUTO_ARCHIVE); } + public boolean isAsyncArchive() { + return getBoolean(HoodieCompactionConfig.ASYNC_ARCHIVE); + } + + public boolean isAutoClean() { + return getBoolean(HoodieCompactionConfig.AUTO_CLEAN); + } + public boolean isAsyncClean() { return getBoolean(HoodieCompactionConfig.ASYNC_CLEAN); } @@ -1138,6 +1167,10 @@ public boolean inlineCompactionEnabled() { return getBoolean(HoodieCompactionConfig.INLINE_COMPACT); } + public boolean scheduleInlineCompaction() { + return getBoolean(HoodieCompactionConfig.SCHEDULE_INLINE_COMPACT); + } + public CompactionTriggerStrategy getInlineCompactTriggerStrategy() { return CompactionTriggerStrategy.valueOf(getString(HoodieCompactionConfig.INLINE_COMPACT_TRIGGER_STRATEGY)); } @@ -1178,6 +1211,10 @@ public boolean inlineClusteringEnabled() { return getBoolean(HoodieClusteringConfig.INLINE_CLUSTERING); } + public boolean scheduleInlineClustering() { + return getBoolean(HoodieClusteringConfig.SCHEDULE_INLINE_CLUSTERING); + } + public boolean isAsyncClusteringEnabled() { return getBoolean(HoodieClusteringConfig.ASYNC_CLUSTERING_ENABLE); } @@ -1288,30 +1325,21 @@ public String getClusteringSortColumns() { return getString(HoodieClusteringConfig.PLAN_STRATEGY_SORT_COLUMNS); } - /** - * Data layout optimize properties.
- */ - public boolean isLayoutOptimizationEnabled() { - return getBoolean(HoodieClusteringConfig.LAYOUT_OPTIMIZE_ENABLE); + public HoodieClusteringConfig.LayoutOptimizationStrategy getLayoutOptimizationStrategy() { + return HoodieClusteringConfig.LayoutOptimizationStrategy.fromValue( + getStringOrDefault(HoodieClusteringConfig.LAYOUT_OPTIMIZE_STRATEGY) + ); } - public String getLayoutOptimizationStrategy() { - return getString(HoodieClusteringConfig.LAYOUT_OPTIMIZE_STRATEGY); - } - - public HoodieClusteringConfig.BuildCurveStrategyType getLayoutOptimizationCurveBuildMethod() { - return HoodieClusteringConfig.BuildCurveStrategyType.fromValue( - getString(HoodieClusteringConfig.LAYOUT_OPTIMIZE_CURVE_BUILD_METHOD)); + public HoodieClusteringConfig.SpatialCurveCompositionStrategyType getLayoutOptimizationCurveBuildMethod() { + return HoodieClusteringConfig.SpatialCurveCompositionStrategyType.fromValue( + getString(HoodieClusteringConfig.LAYOUT_OPTIMIZE_SPATIAL_CURVE_BUILD_METHOD)); } public int getLayoutOptimizationSampleSize() { return getInt(HoodieClusteringConfig.LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE); } - public boolean isDataSkippingEnabled() { - return getBoolean(HoodieClusteringConfig.LAYOUT_OPTIMIZE_DATA_SKIPPING_ENABLE); - } - /** * index properties. */ @@ -1444,6 +1472,14 @@ public boolean useBloomIndexBucketizedChecking() { return getBoolean(HoodieIndexConfig.BLOOM_INDEX_BUCKETIZED_CHECKING); } + public boolean isMetadataBloomFilterIndexEnabled() { + return isMetadataTableEnabled() && getMetadataConfig().isBloomFilterIndexEnabled(); + } + + public boolean isMetadataIndexColumnStatsForAllColumnsEnabled() { + return isMetadataTableEnabled() && getMetadataConfig().isMetadataColumnStatsIndexForAllColumnsEnabled(); + } + public int getBloomIndexKeysPerBucket() { return getInt(HoodieIndexConfig.BLOOM_INDEX_KEYS_PER_BUCKET); } @@ -1515,6 +1551,11 @@ public String parquetOutputTimestampType() { return getString(HoodieStorageConfig.PARQUET_OUTPUT_TIMESTAMP_TYPE); } + public Option getLogDataBlockFormat() { + return Option.ofNullable(getString(HoodieStorageConfig.LOGFILE_DATA_BLOCK_FORMAT)) + .map(HoodieLogBlock.HoodieLogBlockType::fromId); + } + public long getLogFileMaxSize() { return getLong(HoodieStorageConfig.LOGFILE_MAX_SIZE); } @@ -1681,7 +1722,7 @@ public boolean getPushGatewayRandomJobNameSuffix() { public String getMetricReporterMetricsNamePrefix() { return getStringOrDefault(HoodieMetricsConfig.METRICS_REPORTER_PREFIX); } - + /** * memory configs. */ @@ -1701,6 +1742,10 @@ public ConsistencyGuardConfig getConsistencyGuardConfig() { return consistencyGuardConfig; } + public FileSystemRetryConfig getFileSystemRetryConfig() { + return fileSystemRetryConfig; + } + public void setConsistencyGuardConfig(ConsistencyGuardConfig consistencyGuardConfig) { this.consistencyGuardConfig = consistencyGuardConfig; } @@ -1853,12 +1898,12 @@ public WriteConcurrencyMode getWriteConcurrencyMode() { } /** - * Are any table services configured to run inline? + * Are any table services configured to run inline for both scheduling and execution? * * @return True if any table services are configured to run inline, false otherwise. 
*/ - public Boolean areAnyTableServicesInline() { - return inlineClusteringEnabled() || inlineCompactionEnabled() || isAutoClean(); + public Boolean areAnyTableServicesExecutedInline() { + return inlineClusteringEnabled() || inlineCompactionEnabled() || isAutoClean() || isAutoArchive(); } /** @@ -1867,7 +1912,11 @@ public Boolean areAnyTableServicesInline() { * @return True if any table services are configured to run async, false otherwise. */ public Boolean areAnyTableServicesAsync() { - return isAsyncClusteringEnabled() || !inlineCompactionEnabled() || isAsyncClean(); + return isAsyncClusteringEnabled() || !inlineCompactionEnabled() || isAsyncClean() || isAsyncArchive(); + } + + public Boolean areAnyTableServicesScheduledInline() { + return scheduleInlineCompaction() || scheduleInlineClustering(); } public String getPreCommitValidators() { @@ -1898,6 +1947,14 @@ public String getFileIdPrefixProviderClassName() { return getString(FILEID_PREFIX_PROVIDER_CLASS); } + public boolean areTableServicesEnabled() { + return getBooleanOrDefault(TABLE_SERVICES_ENABLED); + } + + public boolean areReleaseResourceEnabled() { + return getBooleanOrDefault(RELEASE_RESOURCE_ENABLE); + } + /** * Layout configs. */ @@ -2263,6 +2320,16 @@ public Builder withFileIdPrefixProviderClassName(String fileIdPrefixProviderClas return this; } + public Builder withTableServicesEnabled(boolean enabled) { + writeConfig.setValue(TABLE_SERVICES_ENABLED, Boolean.toString(enabled)); + return this; + } + + public Builder withReleaseResourceEnabled(boolean enabled) { + writeConfig.setValue(RELEASE_RESOURCE_ENABLE, Boolean.toString(enabled)); + return this; + } + public Builder withProperties(Properties properties) { this.writeConfig.getProps().putAll(properties); return this; diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/exception/HoodieRestoreException.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/exception/HoodieRestoreException.java index c6c9076f51bae..baad53aba5941 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/exception/HoodieRestoreException.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/exception/HoodieRestoreException.java @@ -23,4 +23,8 @@ public class HoodieRestoreException extends HoodieException { public HoodieRestoreException(String msg, Throwable e) { super(msg, e); } + + public HoodieRestoreException(String msg) { + super(msg); + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/execution/HoodieLazyInsertIterable.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/execution/HoodieLazyInsertIterable.java index 0d28c74e13f9b..b078076b864f5 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/execution/HoodieLazyInsertIterable.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/execution/HoodieLazyInsertIterable.java @@ -18,7 +18,6 @@ package org.apache.hudi.execution; -import java.util.Properties; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.utils.LazyIterableIterator; import org.apache.hudi.common.engine.TaskContextSupplier; @@ -36,6 +35,7 @@ import java.util.Iterator; import java.util.List; +import java.util.Properties; import java.util.function.Function; /** @@ -87,7 +87,7 @@ public static class HoodieInsertValueGenResult { public HoodieInsertValueGenResult(T record, Schema schema, Properties properties) { this.record = record; try { - this.insertValue = record.getData().getInsertValue(schema, 
properties); + this.insertValue = ((HoodieRecordPayload) record.getData()).getInsertValue(schema, properties); } catch (Exception e) { this.exception = Option.of(e); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndex.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndex.java index 0428d12c40306..922371c4a0f45 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndex.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndex.java @@ -27,7 +27,6 @@ import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieIndexException; @@ -39,13 +38,11 @@ /** * Base class for different types of indexes to determine the mapping from uuid. * - * @param <T> Sub type of HoodieRecordPayload * @param <I> Type of inputs for deprecated APIs - * @param <K> Type of keys for deprecated APIs * @param <O> Type of outputs for deprecated APIs */ @PublicAPIClass(maturity = ApiMaturityLevel.EVOLVING) -public abstract class HoodieIndex<T extends HoodieRecordPayload, I, K, O> implements Serializable { +public abstract class HoodieIndex<I, O> implements Serializable { protected final HoodieWriteConfig config; @@ -60,7 +57,7 @@ protected HoodieIndex(HoodieWriteConfig config) { @Deprecated @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) public I tagLocation(I records, HoodieEngineContext context, - HoodieTable<T, I, K, O> hoodieTable) throws HoodieIndexException { + HoodieTable hoodieTable) throws HoodieIndexException { throw new HoodieNotSupportedException("Deprecated API should not be called"); } @@ -70,7 +67,7 @@ public I tagLocation(I records, HoodieEngineContext context, @Deprecated @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) public O updateLocation(O writeStatuses, HoodieEngineContext context, - HoodieTable<T, I, K, O> hoodieTable) throws HoodieIndexException { + HoodieTable hoodieTable) throws HoodieIndexException { throw new HoodieNotSupportedException("Deprecated API should not be called"); } @@ -79,8 +76,8 @@ public O updateLocation(O writeStatuses, HoodieEngineContext context, * the row (if it is actually present).
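 * <p>A hypothetical call-site sketch (engine-specific subclasses supply the actual lookup):
 * <pre>{@code
 *   HoodieData<HoodieRecord<R>> taggedRecords = index.tagLocation(records, context, hoodieTable);
 * }</pre>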
*/ @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) - public abstract HoodieData> tagLocation( - HoodieData> records, HoodieEngineContext context, + public abstract HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, HoodieTable hoodieTable) throws HoodieIndexException; /** @@ -144,6 +141,6 @@ public void close() { } public enum IndexType { - HBASE, INMEMORY, BLOOM, GLOBAL_BLOOM, SIMPLE, GLOBAL_SIMPLE, BUCKET + HBASE, INMEMORY, BLOOM, GLOBAL_BLOOM, SIMPLE, GLOBAL_SIMPLE, BUCKET, FLINK_STATE } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndexUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndexUtils.java index e5426ca1161f9..b714c50334b4f 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndexUtils.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndexUtils.java @@ -18,17 +18,30 @@ package org.apache.hudi.index; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.HoodieTimer; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieIndexException; +import org.apache.hudi.io.storage.HoodieFileReader; +import org.apache.hudi.io.storage.HoodieFileReaderFactory; import org.apache.hudi.table.HoodieTable; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.Set; +import java.util.TreeSet; import static java.util.stream.Collectors.toList; @@ -37,6 +50,8 @@ */ public class HoodieIndexUtils { + private static final Logger LOG = LogManager.getLogger(HoodieIndexUtils.class); + /** * Fetches Pair of partition path and {@link HoodieBaseFile}s for interested partitions. * @@ -87,18 +102,48 @@ public static List> getLatestBaseFilesForAllPartiti * @return the tagged {@link HoodieRecord} */ public static HoodieRecord getTaggedRecord(HoodieRecord inputRecord, Option location) { - HoodieRecord record = inputRecord; + HoodieRecord record = inputRecord; if (location.isPresent()) { // When you have a record in multiple files in the same partition, then collection // will have 2 entries with the same exact in memory copy of the HoodieRecord and the 2 // separate filenames that the record is found in. This will result in setting // currentLocation 2 times and it will fail the second time. So creating a new in memory // copy of the hoodie record. - record = new HoodieRecord<>(inputRecord); + record = inputRecord.newInstance(); record.unseal(); record.setCurrentLocation(location.get()); record.seal(); } return record; } + + /** + * Given a list of row keys and one file, return only row keys existing in that file. 
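+ * <p>A hypothetical usage sketch (the path and candidate keys are illustrative):
+ * <pre>{@code
+ *   List<String> foundKeys = HoodieIndexUtils.filterKeysFromFile(
+ *       new Path(partitionPath, baseFileName), Arrays.asList("key1", "key2"), hadoopConf);
+ * }</pre>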
+ * + * @param filePath - File to filter keys from + * @param candidateRecordKeys - Candidate keys to filter + * @return List of candidate keys that are available in the file + */ + public static List filterKeysFromFile(Path filePath, List candidateRecordKeys, + Configuration configuration) throws HoodieIndexException { + ValidationUtils.checkArgument(FSUtils.isBaseFile(filePath)); + List foundRecordKeys = new ArrayList<>(); + try { + // Load all rowKeys from the file, to double-confirm + if (!candidateRecordKeys.isEmpty()) { + HoodieTimer timer = new HoodieTimer().startTimer(); + HoodieFileReader fileReader = HoodieFileReaderFactory.getFileReader(configuration, filePath); + Set fileRowKeys = fileReader.filterRowKeys(new TreeSet<>(candidateRecordKeys)); + foundRecordKeys.addAll(fileRowKeys); + LOG.info(String.format("Checked keys against file %s, in %d ms. #candidates (%d) #found (%d)", filePath, + timer.endTimer(), candidateRecordKeys.size(), foundRecordKeys.size())); + if (LOG.isDebugEnabled()) { + LOG.debug("Keys matching for file " + filePath + " => " + foundRecordKeys); + } + } + } catch (Exception e) { + throw new HoodieIndexException("Error checking candidate keys against file.", e); + } + return foundRecordKeys; + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/BaseHoodieBloomIndexHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/BaseHoodieBloomIndexHelper.java index 9f0e815632f38..9430d9bb5e50b 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/BaseHoodieBloomIndexHelper.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/BaseHoodieBloomIndexHelper.java @@ -24,7 +24,7 @@ import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecordLocation; -import org.apache.hudi.common.util.collection.ImmutablePair; +import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; @@ -51,7 +51,7 @@ public abstract class BaseHoodieBloomIndexHelper implements Serializable { public abstract HoodiePairData findMatchingFilesForRecordKeys( HoodieWriteConfig config, HoodieEngineContext context, HoodieTable hoodieTable, HoodiePairData partitionRecordKeyPairs, - HoodieData> fileComparisonPairs, + HoodieData> fileComparisonPairs, Map> partitionToFileInfo, Map recordsPerPartition); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBaseBloomIndexCheckFunction.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBaseBloomIndexCheckFunction.java index 441a212c59f40..80031f4e8f025 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBaseBloomIndexCheckFunction.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBaseBloomIndexCheckFunction.java @@ -25,7 +25,7 @@ import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIndexException; import org.apache.hudi.io.HoodieKeyLookupHandle; -import org.apache.hudi.io.HoodieKeyLookupHandle.KeyLookupResult; +import org.apache.hudi.io.HoodieKeyLookupResult; import org.apache.hudi.table.HoodieTable; import java.util.function.Function; @@ -37,7 +37,7 @@ * Function performing actual checking of list containing (fileId, hoodieKeys) against the actual files. 
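 * <p>Consecutive keys belonging to the same file are funneled into a single
 * {@link HoodieKeyLookupHandle}, so each file is opened and checked only once; a minimal sketch
 * of driving it (the input iterator is illustrative):
 * <pre>{@code
 *   Iterator<List<HoodieKeyLookupResult>> results =
 *       new HoodieBaseBloomIndexCheckFunction(hoodieTable, config).apply(fileIdToKeyPairs);
 * }</pre>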
*/ public class HoodieBaseBloomIndexCheckFunction - implements Function>, Iterator>> { + implements Function>, Iterator>> { private final HoodieTable hoodieTable; @@ -49,11 +49,11 @@ public HoodieBaseBloomIndexCheckFunction(HoodieTable hoodieTable, HoodieWriteCon } @Override - public Iterator> apply(Iterator> filePartitionRecordKeyTripletItr) { + public Iterator> apply(Iterator> filePartitionRecordKeyTripletItr) { return new LazyKeyCheckIterator(filePartitionRecordKeyTripletItr); } - class LazyKeyCheckIterator extends LazyIterableIterator, List> { + class LazyKeyCheckIterator extends LazyIterableIterator, List> { private HoodieKeyLookupHandle keyLookupHandle; @@ -66,8 +66,8 @@ protected void start() { } @Override - protected List computeNext() { - List ret = new ArrayList<>(); + protected List computeNext() { + List ret = new ArrayList<>(); try { // process one file in each go. while (inputItr.hasNext()) { @@ -83,7 +83,7 @@ protected List computeNext() { } // if continue on current file - if (keyLookupHandle.getPartitionPathFilePair().equals(partitionPathFilePair)) { + if (keyLookupHandle.getPartitionPathFileIDPair().equals(partitionPathFilePair)) { keyLookupHandle.addKey(recordKey); } else { // do the actual checking of file & break out diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndex.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndex.java index a223d695cc03a..d3e73c058cc56 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndex.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndex.java @@ -19,20 +19,22 @@ package org.apache.hudi.index.bloom; +import org.apache.hudi.avro.model.HoodieMetadataColumnStats; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.config.HoodieConfig; import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.data.HoodiePairData; import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; -import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.ImmutablePair; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieMetadataException; import org.apache.hudi.exception.MetadataNotFoundException; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.index.HoodieIndexUtils; @@ -46,6 +48,7 @@ import java.util.List; import java.util.Map; import java.util.stream.Collectors; +import java.util.stream.Stream; import static java.util.stream.Collectors.groupingBy; import static java.util.stream.Collectors.mapping; @@ -55,8 +58,7 @@ /** * Indexing mechanism based on bloom filter. Each parquet file includes its row_key bloom filter in its metadata. 
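 * <p>A minimal sketch of enabling this index through the write config (builder method names
 * assumed from Hudi's config classes):
 * <pre>{@code
 *   HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
 *       .withPath(basePath)
 *       .withIndexConfig(HoodieIndexConfig.newBuilder()
 *           .withIndexType(HoodieIndex.IndexType.BLOOM).build())
 *       .build();
 * }</pre>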
*/ -public class HoodieBloomIndex> - extends HoodieIndex { +public class HoodieBloomIndex extends HoodieIndex { private static final Logger LOG = LogManager.getLogger(HoodieBloomIndex.class); private final BaseHoodieBloomIndexHelper bloomIndexHelper; @@ -67,8 +69,8 @@ public HoodieBloomIndex(HoodieWriteConfig config, BaseHoodieBloomIndexHelper blo } @Override - public HoodieData> tagLocation( - HoodieData> records, HoodieEngineContext context, + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, HoodieTable hoodieTable) { // Step 0: cache the input records if needed if (config.getBloomIndexUseCaching()) { @@ -94,7 +96,7 @@ record -> new ImmutablePair<>(record.getPartitionPath(), record.getRecordKey())) } // Step 3: Tag the incoming records, as inserts or updates, by joining with existing record keys - HoodieData> taggedRecords = tagLocationBacktoRecords(keyFilenamePairs, records); + HoodieData> taggedRecords = tagLocationBacktoRecords(keyFilenamePairs, records); if (config.getBloomIndexUseCaching()) { records.unpersist(); @@ -111,19 +113,25 @@ record -> new ImmutablePair<>(record.getPartitionPath(), record.getRecordKey())) private HoodiePairData lookupIndex( HoodiePairData partitionRecordKeyPairs, final HoodieEngineContext context, final HoodieTable hoodieTable) { - // Obtain records per partition, in the incoming records + // Step 1: Obtain records per partition, in the incoming records Map recordsPerPartition = partitionRecordKeyPairs.countByKey(); List affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet()); // Step 2: Load all involved files as pairs - List> fileInfoList = - loadInvolvedFiles(affectedPartitionPathList, context, hoodieTable); + List> fileInfoList; + if (config.getBloomIndexPruneByRanges()) { + fileInfoList = (config.getMetadataConfig().isColumnStatsIndexEnabled() + ? loadColumnRangesFromMetaIndex(affectedPartitionPathList, context, hoodieTable) + : loadColumnRangesFromFiles(affectedPartitionPathList, context, hoodieTable)); + } else { + fileInfoList = getFileInfoForLatestBaseFiles(affectedPartitionPathList, context, hoodieTable); + } final Map> partitionToFileInfo = fileInfoList.stream().collect(groupingBy(Pair::getLeft, mapping(Pair::getRight, toList()))); // Step 3: Obtain a HoodieData, for each incoming record, that already exists, with the file id, // that contains it. - HoodieData> fileComparisonPairs = + HoodieData> fileComparisonPairs = explodeRecordsWithFileComparisons(partitionToFileInfo, partitionRecordKeyPairs); return bloomIndexHelper.findMatchingFilesForRecordKeys(config, context, hoodieTable, @@ -133,30 +141,84 @@ private HoodiePairData lookupIndex( /** * Load all involved files as pair List. */ - List> loadInvolvedFiles( + List> loadColumnRangesFromFiles( List partitions, final HoodieEngineContext context, final HoodieTable hoodieTable) { // Obtain the latest data files from all the partitions. 
List> partitionPathFileIDList = getLatestBaseFilesForAllPartitions(partitions, context, hoodieTable).stream() .map(pair -> Pair.of(pair.getKey(), pair.getValue().getFileId())) .collect(toList()); - if (config.getBloomIndexPruneByRanges()) { - // also obtain file ranges, if range pruning is enabled - context.setJobStatus(this.getClass().getName(), "Obtain key ranges for file slices (range pruning=on)"); - return context.map(partitionPathFileIDList, pf -> { - try { - HoodieRangeInfoHandle rangeInfoHandle = new HoodieRangeInfoHandle(config, hoodieTable, pf); - String[] minMaxKeys = rangeInfoHandle.getMinMaxKeys(); - return Pair.of(pf.getKey(), new BloomIndexFileInfo(pf.getValue(), minMaxKeys[0], minMaxKeys[1])); - } catch (MetadataNotFoundException me) { - LOG.warn("Unable to find range metadata in file :" + pf); - return Pair.of(pf.getKey(), new BloomIndexFileInfo(pf.getValue())); + context.setJobStatus(this.getClass().getName(), "Obtain key ranges for file slices (range pruning=on)"); + return context.map(partitionPathFileIDList, pf -> { + try { + HoodieRangeInfoHandle rangeInfoHandle = new HoodieRangeInfoHandle(config, hoodieTable, pf); + String[] minMaxKeys = rangeInfoHandle.getMinMaxKeys(); + return Pair.of(pf.getKey(), new BloomIndexFileInfo(pf.getValue(), minMaxKeys[0], minMaxKeys[1])); + } catch (MetadataNotFoundException me) { + LOG.warn("Unable to find range metadata in file :" + pf); + return Pair.of(pf.getKey(), new BloomIndexFileInfo(pf.getValue())); + } + }, Math.max(partitionPathFileIDList.size(), 1)); + } + + /** + * Get BloomIndexFileInfo for all the latest base files for the requested partitions. + * + * @param partitions - List of partitions to get the base files for + * @param context - Engine context + * @param hoodieTable - Hoodie Table + * @return List of partition and file column range info pairs + */ + private List> getFileInfoForLatestBaseFiles( + List partitions, final HoodieEngineContext context, final HoodieTable hoodieTable) { + List> partitionPathFileIDList = getLatestBaseFilesForAllPartitions(partitions, context, + hoodieTable).stream() + .map(pair -> Pair.of(pair.getKey(), pair.getValue().getFileId())) + .collect(toList()); + return partitionPathFileIDList.stream() + .map(pf -> Pair.of(pf.getKey(), new BloomIndexFileInfo(pf.getValue()))).collect(toList()); + } + + /** + * Load the column stats index as BloomIndexFileInfo for all the involved files in the partition. 
+ * + * @param partitions - List of partitions for which column stats need to be loaded + * @param context - Engine context + * @param hoodieTable - Hoodie table + * @return List of partition and file column range info pairs + */ + protected List> loadColumnRangesFromMetaIndex( + List partitions, final HoodieEngineContext context, final HoodieTable hoodieTable) { + // Obtain the record key ranges from the metadata table's column stats index; this path is only taken when range pruning is enabled + context.setJobStatus(this.getClass().getName(), "Load meta index key ranges for file slices"); + + final String keyField = hoodieTable.getMetaClient().getTableConfig().getRecordKeyFieldProp(); + return context.flatMap(partitions, partitionName -> { + // Partition and file name pairs + List> partitionFileNameList = HoodieIndexUtils.getLatestBaseFilesForPartition(partitionName, + hoodieTable).stream().map(baseFile -> Pair.of(partitionName, baseFile.getFileName())) + .sorted() + .collect(toList()); + if (partitionFileNameList.isEmpty()) { + return Stream.empty(); + } + try { + Map, HoodieMetadataColumnStats> fileToColumnStatsMap = hoodieTable + .getMetadataTable().getColumnStats(partitionFileNameList, keyField); + List> result = new ArrayList<>(); + for (Map.Entry, HoodieMetadataColumnStats> entry : fileToColumnStatsMap.entrySet()) { + result.add(Pair.of(entry.getKey().getLeft(), + new BloomIndexFileInfo( + FSUtils.getFileId(entry.getKey().getRight()), + entry.getValue().getMinValue(), + entry.getValue().getMaxValue() + ))); } - }, Math.max(partitionPathFileIDList.size(), 1)); - } else { - return partitionPathFileIDList.stream() - .map(pf -> Pair.of(pf.getKey(), new BloomIndexFileInfo(pf.getValue()))).collect(toList()); - } + return result.stream(); + } catch (MetadataNotFoundException me) { + throw new HoodieMetadataException("Unable to find column range metadata for partition: " + partitionName, me); + } + }, Math.max(partitions.size(), 1)); } @Override @@ -197,7 +259,7 @@ public boolean isImplicitWithStorage() { * Sub-partition to ensure the records can be looked up against files & also prune file<=>record comparisons based on * recordKey ranges in the index info. */ - HoodieData> explodeRecordsWithFileComparisons( + HoodieData> explodeRecordsWithFileComparisons( final Map> partitionToFileIndexInfo, HoodiePairData partitionRecordKeyPairs) { IndexFileFilter indexFileFilter = @@ -209,7 +271,7 @@ HoodieData> explodeRecordsWithFileComparisons( String partitionPath = partitionRecordKeyPair.getLeft(); return indexFileFilter.getMatchingFilesAndPartition(partitionPath, recordKey).stream() - .map(partitionFileIdPair -> new ImmutablePair<>(partitionFileIdPair.getRight(), + .map(partitionFileIdPair -> (Pair) new ImmutablePair<>(partitionFileIdPair.getRight(), new HoodieKey(recordKey, partitionPath))) .collect(Collectors.toList()); }).flatMap(List::iterator); @@ -218,10 +280,10 @@ HoodieData> explodeRecordsWithFileComparisons( /** * Tag the looked-up locations back onto the original HoodieRecord list. */ - protected HoodieData> tagLocationBacktoRecords( + protected HoodieData> tagLocationBacktoRecords( HoodiePairData keyFilenamePair, - HoodieData> records) { - HoodiePairData> keyRecordPairs = + HoodieData> records) { + HoodiePairData> keyRecordPairs = records.mapToPair(record -> new ImmutablePair<>(record.getKey(), record)); // Here as the records might have more data than keyFilenamePairs (some row keys' fileId is null), // so we do left outer join.
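The refactored lookupIndex dispatch above resolves per-file key ranges from one of three sources: the metadata table's column stats index, the base file footers, or, when range pruning is disabled, a plain listing of the latest base files with no ranges attached. The sketch below illustrates the pruning predicate those ranges make possible; FileRange, mightContain, and candidateFiles are simplified hypothetical stand-ins, not the actual Hudi classes:

```java
import java.util.ArrayList;
import java.util.List;

// Simplified stand-in for BloomIndexFileInfo-style range pruning (hypothetical types):
// a file stays a candidate for a record key only if the key falls inside the file's
// [minKey, maxKey] range, or if no range is known for the file.
public class RangePruningSketch {

  static final class FileRange {
    final String fileId;
    final String minKey; // null when ranges were not loaded (pruning disabled / missing stats)
    final String maxKey;

    FileRange(String fileId, String minKey, String maxKey) {
      this.fileId = fileId;
      this.minKey = minKey;
      this.maxKey = maxKey;
    }

    boolean mightContain(String recordKey) {
      if (minKey == null || maxKey == null) {
        return true; // no range info: the file's bloom filter must be consulted anyway
      }
      return minKey.compareTo(recordKey) <= 0 && recordKey.compareTo(maxKey) <= 0;
    }
  }

  // Returns only the files whose key range could contain the record key; these are the
  // (fileId, recordKey) comparison pairs that the bloom filter lookup then verifies.
  static List<String> candidateFiles(List<FileRange> filesInPartition, String recordKey) {
    List<String> candidates = new ArrayList<>();
    for (FileRange file : filesInPartition) {
      if (file.mightContain(recordKey)) {
        candidates.add(file.fileId);
      }
    }
    return candidates;
  }

  public static void main(String[] args) {
    List<FileRange> files = new ArrayList<>();
    files.add(new FileRange("f1", "key_000", "key_499"));
    files.add(new FileRange("f2", "key_500", "key_999"));
    files.add(new FileRange("f3", null, null)); // stats unavailable: always a candidate
    System.out.println(candidateFiles(files, "key_742")); // prints [f2, f3]
  }
}
```

Files with no range information always remain candidates, so missing column stats can only cost extra bloom filter checks, never correctness.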
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieGlobalBloomIndex.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieGlobalBloomIndex.java index 39fa72a329fe3..5f2007ea53668 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieGlobalBloomIndex.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieGlobalBloomIndex.java @@ -24,6 +24,7 @@ import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.EmptyHoodieRecordPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; @@ -46,7 +47,7 @@ * This filter will only work with hoodie table since it will only load partitions * with .hoodie_partition_metadata file in it. */ -public class HoodieGlobalBloomIndex> extends HoodieBloomIndex { +public class HoodieGlobalBloomIndex extends HoodieBloomIndex { public HoodieGlobalBloomIndex(HoodieWriteConfig config, BaseHoodieBloomIndexHelper bloomIndexHelper) { super(config, bloomIndexHelper); } @@ -55,11 +56,11 @@ public HoodieGlobalBloomIndex(HoodieWriteConfig config, BaseHoodieBloomIndexHelp * Load all involved files as pairs from all partitions in the table. */ @Override - List> loadInvolvedFiles(List partitions, final HoodieEngineContext context, - final HoodieTable hoodieTable) { + List> loadColumnRangesFromFiles(List partitions, final HoodieEngineContext context, + final HoodieTable hoodieTable) { HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); List allPartitionPaths = FSUtils.getAllPartitionPaths(context, config.getMetadataConfig(), metaClient.getBasePath()); - return super.loadInvolvedFiles(allPartitionPaths, context, hoodieTable); + return super.loadColumnRangesFromFiles(allPartitionPaths, context, hoodieTable); } /** @@ -73,7 +74,7 @@ List> loadInvolvedFiles(List partitions */ @Override - HoodieData> explodeRecordsWithFileComparisons( + HoodieData> explodeRecordsWithFileComparisons( final Map> partitionToFileIndexInfo, HoodiePairData partitionRecordKeyPairs) { @@ -86,7 +87,7 @@ HoodieData> explodeRecordsWithFileComparisons( String partitionPath = partitionRecordKeyPair.getLeft(); return indexFileFilter.getMatchingFilesAndPartition(partitionPath, recordKey).stream() - .map(partitionFileIdPair -> new ImmutablePair<>(partitionFileIdPair.getRight(), + .map(partitionFileIdPair -> (Pair) new ImmutablePair<>(partitionFileIdPair.getRight(), new HoodieKey(recordKey, partitionFileIdPair.getLeft()))) .collect(Collectors.toList()); }).flatMap(List::iterator); @@ -96,11 +97,11 @@ HoodieData> explodeRecordsWithFileComparisons( * Tagging for global index should only consider the record key. */ @Override - protected HoodieData> tagLocationBacktoRecords( + protected HoodieData> tagLocationBacktoRecords( HoodiePairData keyLocationPairs, - HoodieData> records) { + HoodieData> records) { - HoodiePairData> incomingRowKeyRecordPairs = + HoodiePairData> incomingRowKeyRecordPairs = records.mapToPair(record -> new ImmutablePair<>(record.getRecordKey(), record)); HoodiePairData> existingRecordKeyToRecordLocationHoodieKeyMap = @@ -109,29 +110,29 @@ protected HoodieData> tagLocationBacktoRecords( // Here as the records might have more data than rowKeys (some rowKeys' fileId is null), so we do left outer join. 
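// The join below yields one of three outcomes per incoming record: (1) key found under a different partition with bloom index update-partition-path enabled: emit a delete for the old partition plus an untagged insert for the new one; (2) key found otherwise: tag the record with its existing location in the old partition; (3) key not found: pass the record through untagged so it is treated as an insert.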
return incomingRowKeyRecordPairs.leftOuterJoin(existingRecordKeyToRecordLocationHoodieKeyMap).values().flatMap(record -> { - final HoodieRecord hoodieRecord = record.getLeft(); + final HoodieRecord hoodieRecord = record.getLeft(); final Option> recordLocationHoodieKeyPair = record.getRight(); if (recordLocationHoodieKeyPair.isPresent()) { // Record key matched to file if (config.getBloomIndexUpdatePartitionPath() && !recordLocationHoodieKeyPair.get().getRight().getPartitionPath().equals(hoodieRecord.getPartitionPath())) { // Create an empty record to delete the record in the old partition - HoodieRecord deleteRecord = new HoodieRecord(recordLocationHoodieKeyPair.get().getRight(), + HoodieRecord deleteRecord = new HoodieAvroRecord(recordLocationHoodieKeyPair.get().getRight(), new EmptyHoodieRecordPayload()); deleteRecord.setCurrentLocation(recordLocationHoodieKeyPair.get().getLeft()); deleteRecord.seal(); // Tag the incoming record for inserting to the new partition - HoodieRecord insertRecord = HoodieIndexUtils.getTaggedRecord(hoodieRecord, Option.empty()); + HoodieRecord insertRecord = HoodieIndexUtils.getTaggedRecord(hoodieRecord, Option.empty()); return Arrays.asList(deleteRecord, insertRecord).iterator(); } else { // Ignore the incoming record's partition, regardless of whether it differs from its old partition or not. // When it differs, the record will still be updated at its old partition. return Collections.singletonList( - (HoodieRecord) HoodieIndexUtils.getTaggedRecord(new HoodieRecord<>(recordLocationHoodieKeyPair.get().getRight(), hoodieRecord.getData()), + (HoodieRecord) HoodieIndexUtils.getTaggedRecord(new HoodieAvroRecord(recordLocationHoodieKeyPair.get().getRight(), (HoodieRecordPayload) hoodieRecord.getData()), Option.ofNullable(recordLocationHoodieKeyPair.get().getLeft()))).iterator(); } } else { - return Collections.singletonList((HoodieRecord) HoodieIndexUtils.getTaggedRecord(hoodieRecord, Option.empty())).iterator(); + return Collections.singletonList((HoodieRecord) HoodieIndexUtils.getTaggedRecord(hoodieRecord, Option.empty())).iterator(); } }); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/ListBasedHoodieBloomIndexHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/ListBasedHoodieBloomIndexHelper.java index 74191df523659..c42d80c62e758 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/ListBasedHoodieBloomIndexHelper.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/ListBasedHoodieBloomIndexHelper.java @@ -28,7 +28,7 @@ import org.apache.hudi.common.util.collection.ImmutablePair; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.io.HoodieKeyLookupHandle; +import org.apache.hudi.io.HoodieKeyLookupResult; import org.apache.hudi.table.HoodieTable; import java.util.ArrayList; @@ -57,15 +57,14 @@ public static ListBasedHoodieBloomIndexHelper getInstance() { public HoodiePairData findMatchingFilesForRecordKeys( HoodieWriteConfig config, HoodieEngineContext context, HoodieTable hoodieTable, HoodiePairData partitionRecordKeyPairs, - HoodieData> fileComparisonPairs, + HoodieData> fileComparisonPairs, Map> partitionToFileInfo, Map recordsPerPartition) { List> fileComparisonPairList = HoodieList.getList(fileComparisonPairs).stream() - .sorted(Comparator.comparing(ImmutablePair::getLeft)).collect(toList()); + 
.sorted(Comparator.comparing(Pair::getLeft)).collect(toList()); - List keyLookupResults = new ArrayList<>(); - - Iterator> iterator = new HoodieBaseBloomIndexCheckFunction( + List keyLookupResults = new ArrayList<>(); + Iterator> iterator = new HoodieBaseBloomIndexCheckFunction( hoodieTable, config).apply(fileComparisonPairList.iterator()); while (iterator.hasNext()) { keyLookupResults.addAll(iterator.next()); @@ -77,7 +76,7 @@ public HoodiePairData findMatchingFilesForRecor lookupResult.getMatchingRecordKeys().stream() .map(recordKey -> new ImmutablePair<>(lookupResult, recordKey)).iterator() ).mapToPair(pair -> { - HoodieKeyLookupHandle.KeyLookupResult lookupResult = pair.getLeft(); + HoodieKeyLookupResult lookupResult = pair.getLeft(); String recordKey = pair.getRight(); return new ImmutablePair<>( new HoodieKey(recordKey, lookupResult.getPartitionPath()), diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/BucketIdentifier.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/BucketIdentifier.java index 7dee9f3cdfa33..ddd95721a46b6 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/BucketIdentifier.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/BucketIdentifier.java @@ -23,6 +23,7 @@ import org.apache.hudi.common.model.HoodieRecord; import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.Map; import java.util.regex.Pattern; @@ -39,7 +40,7 @@ public static int getBucketId(HoodieRecord record, String indexKeyFields, int nu public static int getBucketId(HoodieKey hoodieKey, String indexKeyFields, int numBuckets) { List hashKeyFields; if (!hoodieKey.getRecordKey().contains(":")) { - hashKeyFields = Arrays.asList(hoodieKey.getRecordKey()); + hashKeyFields = Collections.singletonList(hoodieKey.getRecordKey()); } else { Map recordKeyPairs = Arrays.stream(hoodieKey.getRecordKey().split(",")) .map(p -> p.split(":")) @@ -56,6 +57,10 @@ public static int getBucketId(List hashKeyFields, int numBuckets) { return hashKeyFields.hashCode() % numBuckets; } + public static String partitionBucketIdStr(String partition, int bucketId) { + return String.format("%s_%s", partition, bucketIdStr(bucketId)); + } + public static int bucketIdFromFileId(String fileId) { return Integer.parseInt(fileId.substring(0, 8)); } @@ -64,6 +69,10 @@ public static String bucketIdStr(int n) { return String.format("%08d", n); } + public static String newBucketFileIdPrefix(int bucketId) { + return newBucketFileIdPrefix(bucketIdStr(bucketId)); + } + public static String newBucketFileIdPrefix(String bucketId) { return FSUtils.createNewFileIdPfx().replaceFirst(".{8}", bucketId); } @@ -71,4 +80,8 @@ public static String newBucketFileIdPrefix(String bucketId) { public static boolean isBucketFileName(String name) { return BUCKET_NAME.matcher(name).matches(); } + + public static int mod(int x, int y) { + return x % y; + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/HoodieBucketIndex.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/HoodieBucketIndex.java index acb06ea48bed1..a243eea767856 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/HoodieBucketIndex.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bucket/HoodieBucketIndex.java @@ -24,7 +24,6 @@ import org.apache.hudi.common.engine.HoodieEngineContext; import 
org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; -import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; @@ -34,6 +33,7 @@ import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.index.HoodieIndexUtils; import org.apache.hudi.table.HoodieTable; + import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -42,10 +42,8 @@ /** * Hash indexing mechanism. - * @param */ -public class HoodieBucketIndex> - extends HoodieIndex { +public class HoodieBucketIndex extends HoodieIndex { private static final Logger LOG = LogManager.getLogger(HoodieBucketIndex.class); @@ -66,14 +64,14 @@ public HoodieData updateLocation(HoodieData writeStatu } @Override - public HoodieData> tagLocation(HoodieData> records, - HoodieEngineContext context, + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, HoodieTable hoodieTable) throws HoodieIndexException { - HoodieData> taggedRecords = records.mapPartitions(recordIter -> { + HoodieData> taggedRecords = records.mapPartitions(recordIter -> { // partitionPath -> bucketId -> fileInfo Map>> partitionPathFileIDList = new HashMap<>(); - return new LazyIterableIterator, HoodieRecord>(recordIter) { + return new LazyIterableIterator, HoodieRecord>(recordIter) { @Override protected void start() { @@ -81,7 +79,7 @@ protected void start() { } @Override - protected HoodieRecord computeNext() { + protected HoodieRecord computeNext() { HoodieRecord record = recordIter.next(); int bucketId = BucketIdentifier.getBucketId(record, config.getBucketIndexHashField(), numBuckets); String partitionPath = record.getPartitionPath(); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/inmemory/HoodieInMemoryHashIndex.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/inmemory/HoodieInMemoryHashIndex.java index bec675c102ff5..42dcc1b97d760 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/inmemory/HoodieInMemoryHashIndex.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/inmemory/HoodieInMemoryHashIndex.java @@ -25,7 +25,6 @@ import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; -import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndex; @@ -41,8 +40,8 @@ *

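 * Locations are kept in a static in-process ConcurrentMap keyed by HoodieKey, so the index is shared only within a single JVM and its state is lost on restart.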
* ONLY USE FOR LOCAL TESTING */ -public class HoodieInMemoryHashIndex> - extends HoodieIndex { +public class HoodieInMemoryHashIndex + extends HoodieIndex { private static ConcurrentMap recordLocationMap; @@ -56,13 +55,13 @@ public HoodieInMemoryHashIndex(HoodieWriteConfig config) { } @Override - public HoodieData> tagLocation( - HoodieData> records, HoodieEngineContext context, + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, HoodieTable hoodieTable) { return records.mapPartitions(hoodieRecordIterator -> { - List> taggedRecords = new ArrayList<>(); + List> taggedRecords = new ArrayList<>(); while (hoodieRecordIterator.hasNext()) { - HoodieRecord record = hoodieRecordIterator.next(); + HoodieRecord record = hoodieRecordIterator.next(); if (recordLocationMap.containsKey(record.getKey())) { record.unseal(); record.setCurrentLocation(recordLocationMap.get(record.getKey())); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/simple/HoodieGlobalSimpleIndex.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/simple/HoodieGlobalSimpleIndex.java index 8935fcb02fec2..805ae462a1128 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/simple/HoodieGlobalSimpleIndex.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/simple/HoodieGlobalSimpleIndex.java @@ -24,6 +24,7 @@ import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.EmptyHoodieRecordPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; @@ -47,17 +48,15 @@ /** * A global simple index which reads the fields of interest (record key and partition path) from base files and * joins with incoming records to find the tagged location.
- * - * @param */ -public class HoodieGlobalSimpleIndex> extends HoodieSimpleIndex { +public class HoodieGlobalSimpleIndex extends HoodieSimpleIndex { public HoodieGlobalSimpleIndex(HoodieWriteConfig config, Option keyGeneratorOpt) { super(config, keyGeneratorOpt); } @Override - public HoodieData> tagLocation( - HoodieData> records, HoodieEngineContext context, + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, HoodieTable hoodieTable) { return tagLocationInternal(records, context, hoodieTable); } @@ -71,11 +70,11 @@ public HoodieData> tagLocation( * @return {@link HoodieData} of records with record locations set */ @Override - protected HoodieData> tagLocationInternal( - HoodieData> inputRecords, HoodieEngineContext context, + protected HoodieData> tagLocationInternal( + HoodieData> inputRecords, HoodieEngineContext context, HoodieTable hoodieTable) { - HoodiePairData> keyedInputRecords = + HoodiePairData> keyedInputRecords = inputRecords.mapToPair(entry -> new ImmutablePair<>(entry.getRecordKey(), entry)); HoodiePairData allRecordLocationsInTable = fetchAllRecordLocations(context, hoodieTable, config.getGlobalSimpleIndexParallelism()); @@ -114,8 +113,8 @@ protected List> getAllBaseFilesInTable( * @param existingRecords existing records with {@link HoodieRecordLocation}s * @return {@link HoodieData} of {@link HoodieRecord}s with tagged {@link HoodieRecordLocation}s */ - private HoodieData> getTaggedRecords( - HoodiePairData> incomingRecords, + private HoodieData> getTaggedRecords( + HoodiePairData> incomingRecords, HoodiePairData existingRecords) { HoodiePairData> existingRecordByRecordKey = existingRecords.mapToPair( @@ -124,29 +123,29 @@ private HoodieData> getTaggedRecords( return incomingRecords.leftOuterJoin(existingRecordByRecordKey).values() .flatMap(entry -> { - HoodieRecord inputRecord = entry.getLeft(); + HoodieRecord inputRecord = entry.getLeft(); Option> partitionPathLocationPair = Option.ofNullable(entry.getRight().orElse(null)); - List> taggedRecords; + List> taggedRecords; if (partitionPathLocationPair.isPresent()) { String partitionPath = partitionPathLocationPair.get().getKey(); HoodieRecordLocation location = partitionPathLocationPair.get().getRight(); if (config.getGlobalSimpleIndexUpdatePartitionPath() && !(inputRecord.getPartitionPath().equals(partitionPath))) { // Create an empty record to delete the record in the old partition - HoodieRecord deleteRecord = new HoodieRecord(new HoodieKey(inputRecord.getRecordKey(), partitionPath), new EmptyHoodieRecordPayload()); + HoodieRecord deleteRecord = new HoodieAvroRecord(new HoodieKey(inputRecord.getRecordKey(), partitionPath), new EmptyHoodieRecordPayload()); deleteRecord.setCurrentLocation(location); deleteRecord.seal(); // Tag the incoming record for inserting to the new partition - HoodieRecord insertRecord = (HoodieRecord) HoodieIndexUtils.getTaggedRecord(inputRecord, Option.empty()); + HoodieRecord insertRecord = (HoodieRecord) HoodieIndexUtils.getTaggedRecord(inputRecord, Option.empty()); taggedRecords = Arrays.asList(deleteRecord, insertRecord); } else { // Ignore the incoming record's partition, regardless of whether it differs from its old partition or not. // When it differs, the record will still be updated at its old partition. 
- HoodieRecord newRecord = new HoodieRecord<>(new HoodieKey(inputRecord.getRecordKey(), partitionPath), inputRecord.getData()); - taggedRecords = Collections.singletonList((HoodieRecord) HoodieIndexUtils.getTaggedRecord(newRecord, Option.ofNullable(location))); + HoodieRecord newRecord = new HoodieAvroRecord(new HoodieKey(inputRecord.getRecordKey(), partitionPath), (HoodieRecordPayload) inputRecord.getData()); + taggedRecords = Collections.singletonList((HoodieRecord) HoodieIndexUtils.getTaggedRecord(newRecord, Option.ofNullable(location))); } } else { - taggedRecords = Collections.singletonList((HoodieRecord) HoodieIndexUtils.getTaggedRecord(inputRecord, Option.empty())); + taggedRecords = Collections.singletonList((HoodieRecord) HoodieIndexUtils.getTaggedRecord(inputRecord, Option.empty())); } return taggedRecords.iterator(); }); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/simple/HoodieSimpleIndex.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/simple/HoodieSimpleIndex.java index dfefe5adabfe9..95823ff51e35d 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/simple/HoodieSimpleIndex.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/simple/HoodieSimpleIndex.java @@ -28,7 +28,6 @@ import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; -import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.ImmutablePair; import org.apache.hudi.common.util.collection.Pair; @@ -47,11 +46,9 @@ /** * A simple index which reads the fields of interest (record key and partition path) from base files and * joins with incoming records to find the tagged location.
- * - * @param type of {@link HoodieRecordPayload} */ -public class HoodieSimpleIndex> - extends HoodieIndex { +public class HoodieSimpleIndex + extends HoodieIndex { private final Option keyGeneratorOpt; @@ -88,8 +85,8 @@ public boolean isImplicitWithStorage() { } @Override - public HoodieData> tagLocation( - HoodieData> records, HoodieEngineContext context, + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, HoodieTable hoodieTable) { return tagLocationInternal(records, context, hoodieTable); } @@ -102,23 +99,23 @@ public HoodieData> tagLocation( * @param hoodieTable instance of {@link HoodieTable} to use * @return {@link HoodieData} of records with record locations set */ - protected HoodieData> tagLocationInternal( - HoodieData> inputRecords, HoodieEngineContext context, + protected HoodieData> tagLocationInternal( + HoodieData> inputRecords, HoodieEngineContext context, HoodieTable hoodieTable) { if (config.getSimpleIndexUseCaching()) { inputRecords.persist(new HoodieConfig(config.getProps()) .getString(HoodieIndexConfig.SIMPLE_INDEX_INPUT_STORAGE_LEVEL_VALUE)); } - HoodiePairData> keyedInputRecords = + HoodiePairData> keyedInputRecords = inputRecords.mapToPair(record -> new ImmutablePair<>(record.getKey(), record)); HoodiePairData existingLocationsOnTable = fetchRecordLocationsForAffectedPartitions(keyedInputRecords.keys(), context, hoodieTable, config.getSimpleIndexParallelism()); - HoodieData> taggedRecords = + HoodieData> taggedRecords = keyedInputRecords.leftOuterJoin(existingLocationsOnTable).map(entry -> { - final HoodieRecord untaggedRecord = entry.getRight().getLeft(); + final HoodieRecord untaggedRecord = entry.getRight().getLeft(); final Option location = Option.ofNullable(entry.getRight().getRight().orElse(null)); return HoodieIndexUtils.getTaggedRecord(untaggedRecord, location); }); @@ -151,7 +148,7 @@ protected HoodiePairData fetchRecordLocationsFo protected HoodiePairData fetchRecordLocations( HoodieEngineContext context, HoodieTable hoodieTable, int parallelism, List> baseFiles) { - int fetchParallelism = Math.max(1, Math.max(baseFiles.size(), parallelism)); + int fetchParallelism = Math.max(1, Math.min(baseFiles.size(), parallelism)); return context.parallelize(baseFiles, fetchParallelism) .flatMap(partitionPathBaseFile -> new HoodieKeyLocationFetchHandle(config, hoodieTable, partitionPathBaseFile, keyGeneratorOpt) diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java index 6df05a7c6bd72..7eafe268ba8e8 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java @@ -38,10 +38,12 @@ import org.apache.hudi.common.table.log.AppendResult; import org.apache.hudi.common.table.log.HoodieLogFormat; import org.apache.hudi.common.table.log.HoodieLogFormat.Writer; -import org.apache.hudi.common.table.log.block.HoodieDataBlock; +import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock; import org.apache.hudi.common.table.log.block.HoodieDeleteBlock; +import org.apache.hudi.common.table.log.block.HoodieHFileDataBlock; import org.apache.hudi.common.table.log.block.HoodieLogBlock; import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType; +import org.apache.hudi.common.table.log.block.HoodieParquetDataBlock; import 
org.apache.hudi.common.table.view.TableFileSystemView.SliceView; import org.apache.hudi.common.util.DefaultSizeEstimator; import org.apache.hudi.common.util.Option; @@ -49,6 +51,7 @@ import org.apache.hudi.common.util.SizeEstimator; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieAppendException; +import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieUpsertException; import org.apache.hudi.table.HoodieTable; @@ -360,13 +363,13 @@ protected void appendDataAndDeleteBlocks(Map header) header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, writeSchemaWithMetaFields.toString()); List blocks = new ArrayList<>(2); if (recordList.size() > 0) { - if (config.populateMetaFields()) { - blocks.add(HoodieDataBlock.getBlock(hoodieTable.getLogDataBlockFormat(), recordList, header)); - } else { - final String keyField = hoodieTable.getMetaClient().getTableConfig().getRecordKeyFieldProp(); - blocks.add(HoodieDataBlock.getBlock(hoodieTable.getLogDataBlockFormat(), recordList, header, keyField)); - } + String keyField = config.populateMetaFields() + ? HoodieRecord.RECORD_KEY_METADATA_FIELD + : hoodieTable.getMetaClient().getTableConfig().getRecordKeyFieldProp(); + + blocks.add(getBlock(config, pickLogDataBlockFormat(), recordList, header, keyField)); } + if (keysToDelete.size() > 0) { blocks.add(new HoodieDeleteBlock(keysToDelete.toArray(new HoodieKey[keysToDelete.size()]), header)); } @@ -390,7 +393,7 @@ public boolean canWrite(HoodieRecord record) { @Override public void write(HoodieRecord record, Option insertValue) { - Option> recordMetadata = record.getData().getMetadata(); + Option> recordMetadata = ((HoodieRecordPayload) record.getData()).getMetadata(); try { init(record); flushToDiskIfRequired(record); @@ -497,4 +500,40 @@ private void flushToDiskIfRequired(HoodieRecord record) { numberOfRecords = 0; } } + + private HoodieLogBlock.HoodieLogBlockType pickLogDataBlockFormat() { + Option logBlockTypeOpt = config.getLogDataBlockFormat(); + if (logBlockTypeOpt.isPresent()) { + return logBlockTypeOpt.get(); + } + + // Fallback to deduce data-block type based on the base file format + switch (hoodieTable.getBaseFileFormat()) { + case PARQUET: + case ORC: + return HoodieLogBlock.HoodieLogBlockType.AVRO_DATA_BLOCK; + case HFILE: + return HoodieLogBlock.HoodieLogBlockType.HFILE_DATA_BLOCK; + default: + throw new HoodieException("Base file format " + hoodieTable.getBaseFileFormat() + + " does not have associated log block type"); + } + } + + private static HoodieLogBlock getBlock(HoodieWriteConfig writeConfig, + HoodieLogBlock.HoodieLogBlockType logDataBlockFormat, + List recordList, + Map header, + String keyField) { + switch (logDataBlockFormat) { + case AVRO_DATA_BLOCK: + return new HoodieAvroDataBlock(recordList, header, keyField); + case HFILE_DATA_BLOCK: + return new HoodieHFileDataBlock(recordList, header, writeConfig.getHFileCompressionAlgorithm()); + case PARQUET_DATA_BLOCK: + return new HoodieParquetDataBlock(recordList, header, keyField, writeConfig.getParquetCompressionCodec()); + default: + throw new HoodieException("Data block format " + logDataBlockFormat + " not implemented"); + } + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java index a9ff1f85478cb..096c257b1f797 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java +++ 
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java @@ -18,7 +18,6 @@ package org.apache.hudi.io; -import org.apache.avro.Schema; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.fs.FSUtils; @@ -37,6 +36,7 @@ import org.apache.hudi.io.storage.HoodieFileWriterFactory; import org.apache.hudi.table.HoodieTable; +import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.fs.Path; @@ -128,7 +128,7 @@ public boolean canWrite(HoodieRecord record) { */ @Override public void write(HoodieRecord record, Option avroRecord) { - Option recordMetadata = record.getData().getMetadata(); + Option recordMetadata = ((HoodieRecordPayload) record.getData()).getMetadata(); if (HoodieOperation.isDelete(record.getOperation())) { avroRecord = Option.empty(); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieIOHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieIOHandle.java index c6f9dddef30db..1ad28d14b3a8d 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieIOHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieIOHandle.java @@ -19,6 +19,8 @@ package org.apache.hudi.io; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; @@ -31,8 +33,8 @@ public abstract class HoodieIOHandle { protected final FileSystem fs; protected final HoodieTable hoodieTable; - HoodieIOHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable) { - this.instantTime = instantTime; + HoodieIOHandle(HoodieWriteConfig config, Option instantTime, HoodieTable hoodieTable) { + this.instantTime = instantTime.orElse(StringUtils.EMPTY_STRING); this.config = config; this.hoodieTable = hoodieTable; this.fs = getFileSystem(); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieKeyLocationFetchHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieKeyLocationFetchHandle.java index c33931f503a36..ab8b83c14aeec 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieKeyLocationFetchHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieKeyLocationFetchHandle.java @@ -47,7 +47,7 @@ public class HoodieKeyLocationFetchHandle hoodieTable, Pair partitionPathBaseFilePair, Option keyGeneratorOpt) { - super(config, null, hoodieTable, Pair.of(partitionPathBaseFilePair.getLeft(), partitionPathBaseFilePair.getRight().getFileId())); + super(config, hoodieTable, Pair.of(partitionPathBaseFilePair.getLeft(), partitionPathBaseFilePair.getRight().getFileId())); this.partitionPathBaseFilePair = partitionPathBaseFilePair; this.keyGeneratorOpt = keyGeneratorOpt; } @@ -57,9 +57,9 @@ public Stream> locations() { BaseFileUtils baseFileUtils = BaseFileUtils.getInstance(baseFile.getPath()); List hoodieKeyList = new ArrayList<>(); if (keyGeneratorOpt.isPresent()) { - hoodieKeyList = baseFileUtils.fetchRecordKeyPartitionPath(hoodieTable.getHadoopConf(), new Path(baseFile.getPath()), keyGeneratorOpt); + hoodieKeyList = baseFileUtils.fetchHoodieKeys(hoodieTable.getHadoopConf(), new Path(baseFile.getPath()), keyGeneratorOpt); } 
else { - hoodieKeyList = baseFileUtils.fetchRecordKeyPartitionPath(hoodieTable.getHadoopConf(), new Path(baseFile.getPath())); + hoodieKeyList = baseFileUtils.fetchHoodieKeys(hoodieTable.getHadoopConf(), new Path(baseFile.getPath())); } return hoodieKeyList.stream() .map(entry -> Pair.of(entry, diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieKeyLookupHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieKeyLookupHandle.java index ad84e3e974af8..12d075e0cb532 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieKeyLookupHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieKeyLookupHandle.java @@ -19,25 +19,30 @@ package org.apache.hudi.io; import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.bloom.BloomFilterTypeCode; +import org.apache.hudi.common.bloom.HoodieDynamicBoundedBloomFilter; +import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.util.HoodieTimer; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieIndexException; +import org.apache.hudi.index.HoodieIndexUtils; +import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.hudi.table.HoodieTable; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; -import java.util.HashSet; import java.util.List; -import java.util.Set; /** * Takes a bunch of keys and returns ones that are present in the file group. 
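With the changes below, HoodieKeyLookupHandle can source its bloom filter either from the base file footer or, when useMetadataTableIndex is set, from the metadata table's bloom filter index. Either way the lookup flow is the same two-phase screen-then-verify: keys are first tested against the probabilistic filter, and only the surviving candidates are checked against the keys actually stored in the file. A minimal self-contained sketch of that flow, using a hypothetical BloomLike interface and a plain Set in place of Hudi's BloomFilter and file reader:

```java
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Minimal sketch of the key-lookup flow (hypothetical interfaces, not Hudi's API):
// a bloom filter first screens keys cheaply, then the surviving candidates are
// verified against the keys actually present in the base file.
public class KeyLookupSketch {

  interface BloomLike {
    boolean mightContain(String key); // may return false positives, never false negatives
  }

  static List<String> lookup(BloomLike bloom, Set<String> keysInFile, List<String> incomingKeys) {
    List<String> candidates = new ArrayList<>();
    for (String key : incomingKeys) {
      if (bloom.mightContain(key)) { // cheap probabilistic screen
        candidates.add(key);
      }
    }
    // Expensive exact verification, only over the candidates that survived the screen
    List<String> matches = new ArrayList<>(candidates);
    matches.retainAll(keysInFile);
    return matches;
  }

  public static void main(String[] args) {
    Set<String> fileKeys = new HashSet<>(List.of("a", "b"));
    // A degenerate "bloom filter" that reports every key as possible (all false positives)
    BloomLike alwaysMaybe = key -> true;
    System.out.println(lookup(alwaysMaybe, fileKeys, List.of("a", "c"))); // prints [a]
  }
}
```

Because a bloom filter can produce false positives but never false negatives, the exact verification pass can only shrink the candidate set; no true match is ever lost to the screen.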
@@ -46,52 +51,58 @@ public class HoodieKeyLookupHandle exten private static final Logger LOG = LogManager.getLogger(HoodieKeyLookupHandle.class); - private final HoodieTableType tableType; - private final BloomFilter bloomFilter; - private final List candidateRecordKeys; - + private final boolean useMetadataTableIndex; + private Option fileName = Option.empty(); private long totalKeysChecked; public HoodieKeyLookupHandle(HoodieWriteConfig config, HoodieTable hoodieTable, - Pair partitionPathFilePair) { - super(config, null, hoodieTable, partitionPathFilePair); - this.tableType = hoodieTable.getMetaClient().getTableType(); + Pair partitionPathFileIDPair) { + this(config, hoodieTable, partitionPathFileIDPair, Option.empty(), false); + } + + public HoodieKeyLookupHandle(HoodieWriteConfig config, HoodieTable hoodieTable, + Pair partitionPathFileIDPair, Option fileName, + boolean useMetadataTableIndex) { + super(config, hoodieTable, partitionPathFileIDPair); this.candidateRecordKeys = new ArrayList<>(); this.totalKeysChecked = 0; - HoodieTimer timer = new HoodieTimer().startTimer(); - - try { - this.bloomFilter = createNewFileReader().readBloomFilter(); - } catch (IOException e) { - throw new HoodieIndexException(String.format("Error reading bloom filter from %s: %s", partitionPathFilePair, e)); + if (fileName.isPresent()) { + ValidationUtils.checkArgument(FSUtils.getFileId(fileName.get()).equals(getFileId()), + "File name '" + fileName.get() + "' doesn't match this lookup handle fileid '" + getFileId() + "'"); + this.fileName = fileName; } - LOG.info(String.format("Read bloom filter from %s in %d ms", partitionPathFilePair, timer.endTimer())); + this.useMetadataTableIndex = useMetadataTableIndex; + this.bloomFilter = getBloomFilter(); } - /** - * Given a list of row keys and one file, return only row keys existing in that file. - */ - public List checkCandidatesAgainstFile(Configuration configuration, List candidateRecordKeys, - Path filePath) throws HoodieIndexException { - List foundRecordKeys = new ArrayList<>(); + private BloomFilter getBloomFilter() { + BloomFilter bloomFilter = null; + HoodieTimer timer = new HoodieTimer().startTimer(); try { - // Load all rowKeys from the file, to double-confirm - if (!candidateRecordKeys.isEmpty()) { - HoodieTimer timer = new HoodieTimer().startTimer(); - Set fileRowKeys = createNewFileReader().filterRowKeys(new HashSet<>(candidateRecordKeys)); - foundRecordKeys.addAll(fileRowKeys); - LOG.info(String.format("Checked keys against file %s, in %d ms. 
#candidates (%d) #found (%d)", filePath, - timer.endTimer(), candidateRecordKeys.size(), foundRecordKeys.size())); - if (LOG.isDebugEnabled()) { - LOG.debug("Keys matching for file " + filePath + " => " + foundRecordKeys); + if (this.useMetadataTableIndex) { + ValidationUtils.checkArgument(this.fileName.isPresent(), + "File name not available to fetch bloom filter from the metadata table index."); + Option bloomFilterByteBuffer = + hoodieTable.getMetadataTable().getBloomFilter(partitionPathFileIDPair.getLeft(), fileName.get()); + if (!bloomFilterByteBuffer.isPresent()) { + throw new HoodieIndexException("BloomFilter missing for " + partitionPathFileIDPair.getRight()); + } + bloomFilter = + new HoodieDynamicBoundedBloomFilter(StandardCharsets.UTF_8.decode(bloomFilterByteBuffer.get()).toString(), + BloomFilterTypeCode.DYNAMIC_V0); + } else { + try (HoodieFileReader reader = createNewFileReader()) { + bloomFilter = reader.readBloomFilter(); } } - } catch (Exception e) { - throw new HoodieIndexException("Error checking candidate keys against file.", e); + } catch (IOException e) { + throw new HoodieIndexException(String.format("Error reading bloom filter from %s/%s - %s", + getPartitionPathFileIDPair().getLeft(), this.fileName, e)); } - return foundRecordKeys; + LOG.info(String.format("Read bloom filter from %s in %d ms", partitionPathFileIDPair, timer.endTimer())); + return bloomFilter; } /** @@ -101,7 +112,7 @@ public void addKey(String recordKey) { // check record key against bloom filter of current file & add to possible keys if needed if (bloomFilter.mightContain(recordKey)) { if (LOG.isDebugEnabled()) { - LOG.debug("Record key " + recordKey + " matches bloom filter in " + partitionPathFilePair); + LOG.debug("Record key " + recordKey + " matches bloom filter in " + partitionPathFileIDPair); } candidateRecordKeys.add(recordKey); } @@ -111,53 +122,18 @@ public void addKey(String recordKey) { /** * Of all the keys, that were added, return a list of keys that were actually found in the file group. */ - public KeyLookupResult getLookupResult() { + public HoodieKeyLookupResult getLookupResult() { if (LOG.isDebugEnabled()) { - LOG.debug("#The candidate row keys for " + partitionPathFilePair + " => " + candidateRecordKeys); + LOG.debug("#The candidate row keys for " + partitionPathFileIDPair + " => " + candidateRecordKeys); } HoodieBaseFile dataFile = getLatestDataFile(); - List matchingKeys = - checkCandidatesAgainstFile(hoodieTable.getHadoopConf(), candidateRecordKeys, new Path(dataFile.getPath())); + List matchingKeys = HoodieIndexUtils.filterKeysFromFile(new Path(dataFile.getPath()), candidateRecordKeys, + hoodieTable.getHadoopConf()); LOG.info( String.format("Total records (%d), bloom filter candidates (%d)/fp(%d), actual matches (%d)", totalKeysChecked, candidateRecordKeys.size(), candidateRecordKeys.size() - matchingKeys.size(), matchingKeys.size())); - return new KeyLookupResult(partitionPathFilePair.getRight(), partitionPathFilePair.getLeft(), + return new HoodieKeyLookupResult(partitionPathFileIDPair.getRight(), partitionPathFileIDPair.getLeft(), dataFile.getCommitTime(), matchingKeys); } - - /** - * Encapsulates the result from a key lookup. 
- */ - public static class KeyLookupResult { - - private final String fileId; - private final String baseInstantTime; - private final List matchingRecordKeys; - private final String partitionPath; - - public KeyLookupResult(String fileId, String partitionPath, String baseInstantTime, - List matchingRecordKeys) { - this.fileId = fileId; - this.partitionPath = partitionPath; - this.baseInstantTime = baseInstantTime; - this.matchingRecordKeys = matchingRecordKeys; - } - - public String getFileId() { - return fileId; - } - - public String getBaseInstantTime() { - return baseInstantTime; - } - - public String getPartitionPath() { - return partitionPath; - } - - public List getMatchingRecordKeys() { - return matchingRecordKeys; - } - } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieKeyLookupResult.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieKeyLookupResult.java new file mode 100644 index 0000000000000..19096a21d8700 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieKeyLookupResult.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io; + +import java.util.List; + +/** + * Encapsulates the result from a key lookup. + */ +public class HoodieKeyLookupResult { + + private final String fileId; + private final String baseInstantTime; + private final List matchingRecordKeys; + private final String partitionPath; + + public HoodieKeyLookupResult(String fileId, String partitionPath, String baseInstantTime, + List matchingRecordKeys) { + this.fileId = fileId; + this.partitionPath = partitionPath; + this.baseInstantTime = baseInstantTime; + this.matchingRecordKeys = matchingRecordKeys; + } + + public String getFileId() { + return fileId; + } + + public String getBaseInstantTime() { + return baseInstantTime; + } + + public String getPartitionPath() { + return partitionPath; + } + + public List getMatchingRecordKeys() { + return matchingRecordKeys; + } +} + diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java index 87a8d133f0dd5..32d4ec2a6d794 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java @@ -324,7 +324,7 @@ public void write(GenericRecord oldRecord) { if (keyToNewRecords.containsKey(key)) { // If we have duplicate records that we are updating, then the hoodie record will be deflated after // writing the first record. 
So make a copy of the record to be merged - HoodieRecord hoodieRecord = new HoodieRecord<>(keyToNewRecords.get(key)); + HoodieRecord hoodieRecord = keyToNewRecords.get(key).newInstance(); try { Option combinedAvroRecord = hoodieRecord.getData().combineAndGetUpdateValue(oldRecord, diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieRangeInfoHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieRangeInfoHandle.java index 78fa9be690367..abe4a9befef9b 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieRangeInfoHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieRangeInfoHandle.java @@ -21,6 +21,7 @@ import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.hudi.table.HoodieTable; import java.io.IOException; @@ -32,10 +33,12 @@ public class HoodieRangeInfoHandle exten public HoodieRangeInfoHandle(HoodieWriteConfig config, HoodieTable hoodieTable, Pair partitionPathFilePair) { - super(config, null, hoodieTable, partitionPathFilePair); + super(config, hoodieTable, partitionPathFilePair); } public String[] getMinMaxKeys() throws IOException { - return createNewFileReader().readMinMaxRecordKeys(); + try (HoodieFileReader reader = createNewFileReader()) { + return reader.readMinMaxRecordKeys(); + } } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieReadHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieReadHandle.java index a771c33c40661..fee75b22decd7 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieReadHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieReadHandle.java @@ -18,8 +18,11 @@ package org.apache.hudi.io; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.io.storage.HoodieFileReader; @@ -28,20 +31,17 @@ import java.io.IOException; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; - /** * Base class for read operations done logically on the file group. 
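 * A read handle is scoped to a (partitionPath, fileID) pair and, unlike write handles, carries no instant time.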
*/ public abstract class HoodieReadHandle extends HoodieIOHandle { - protected final Pair partitionPathFilePair; + protected final Pair partitionPathFileIDPair; - public HoodieReadHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable, - Pair partitionPathFilePair) { - super(config, instantTime, hoodieTable); - this.partitionPathFilePair = partitionPathFilePair; + public HoodieReadHandle(HoodieWriteConfig config, HoodieTable hoodieTable, + Pair partitionPathFileIDPair) { + super(config, Option.empty(), hoodieTable); + this.partitionPathFileIDPair = partitionPathFileIDPair; } @Override @@ -49,17 +49,17 @@ protected FileSystem getFileSystem() { return hoodieTable.getMetaClient().getFs(); } - public Pair getPartitionPathFilePair() { - return partitionPathFilePair; + public Pair getPartitionPathFileIDPair() { + return partitionPathFileIDPair; } public String getFileId() { - return partitionPathFilePair.getRight(); + return partitionPathFileIDPair.getRight(); } protected HoodieBaseFile getLatestDataFile() { return hoodieTable.getBaseFileOnlyView() - .getLatestBaseFile(partitionPathFilePair.getLeft(), partitionPathFilePair.getRight()).get(); + .getLatestBaseFile(partitionPathFileIDPair.getLeft(), partitionPathFileIDPair.getRight()).get(); } protected HoodieFileReader createNewFileReader() throws IOException { diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieSortedMergeHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieSortedMergeHandle.java index 533611df2b765..897491b906aae 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieSortedMergeHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieSortedMergeHandle.java @@ -85,7 +85,7 @@ public void write(GenericRecord oldRecord) { } // This is a new insert - HoodieRecord hoodieRecord = new HoodieRecord<>(keyToNewRecords.get(keyToPreWrite)); + HoodieRecord hoodieRecord = keyToNewRecords.get(keyToPreWrite).newInstance(); if (writtenRecordKeys.contains(keyToPreWrite)) { throw new HoodieUpsertException("Insert/Update not in sorted order"); } @@ -108,8 +108,9 @@ public void write(GenericRecord oldRecord) { @Override public List close() { // write out any pending records (this can happen when inserts are turned into updates) - newRecordKeysSorted.stream().forEach(key -> { + while (!newRecordKeysSorted.isEmpty()) { try { + String key = newRecordKeysSorted.poll(); HoodieRecord hoodieRecord = keyToNewRecords.get(key); if (!writtenRecordKeys.contains(hoodieRecord.getRecordKey())) { if (useWriterSchema) { @@ -122,7 +123,7 @@ public List close() { } catch (IOException e) { throw new HoodieUpsertException("Failed to close UpdateHandle", e); } - }); + } newRecordKeysSorted.clear(); keyToNewRecords.clear(); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java index 37721611e2c9a..28e88e16a6482 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java @@ -108,7 +108,7 @@ public HoodieWriteHandle(HoodieWriteConfig config, String instantTime, String pa protected HoodieWriteHandle(HoodieWriteConfig config, String instantTime, String partitionPath, String fileId, HoodieTable hoodieTable, Option overriddenSchema, TaskContextSupplier taskContextSupplier) { 
- super(config, instantTime, hoodieTable); + super(config, Option.of(instantTime), hoodieTable); this.partitionPath = partitionPath; this.fileId = fileId; this.tableSchema = overriddenSchema.orElseGet(() -> getSpecifiedTableSchema(config)); @@ -210,7 +210,7 @@ public void write(HoodieRecord record, Option insertValue) { * Perform the actual writing of the given record into the backing file. */ public void write(HoodieRecord record, Option avroRecord, Option exception) { - Option recordMetadata = record.getData().getMetadata(); + Option recordMetadata = ((HoodieRecordPayload) record.getData()).getMetadata(); if (exception.isPresent() && exception.get() instanceof Throwable) { // Not throwing exception from here, since we don't want to fail the entire job for a single record writeStatus.markFailure(record, exception.get(), recordMetadata); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriterFactory.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriterFactory.java index 0b6afd4d28b92..38db1cde41226 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriterFactory.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieFileWriterFactory.java @@ -87,7 +87,8 @@ private static HoodieFi BloomFilter filter = createBloomFilter(config); HoodieHFileConfig hfileConfig = new HoodieHFileConfig(hoodieTable.getHadoopConf(), config.getHFileCompressionAlgorithm(), config.getHFileBlockSize(), config.getHFileMaxFileSize(), - PREFETCH_ON_OPEN, CACHE_DATA_IN_L1, DROP_BEHIND_CACHE_COMPACTION, filter, HFILE_COMPARATOR); + HoodieHFileReader.KEY_FIELD_NAME, PREFETCH_ON_OPEN, CACHE_DATA_IN_L1, DROP_BEHIND_CACHE_COMPACTION, + filter, HFILE_COMPARATOR); return new HoodieHFileWriter<>(instantTime, path, hfileConfig, schema, taskContextSupplier, config.populateMetaFields()); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileConfig.java index 7e4c519a8fafc..1079566b782f1 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileConfig.java @@ -43,9 +43,10 @@ public class HoodieHFileConfig { private final Configuration hadoopConf; private final BloomFilter bloomFilter; private final KeyValue.KVComparator hfileComparator; + private final String keyFieldName; public HoodieHFileConfig(Configuration hadoopConf, Compression.Algorithm compressionAlgorithm, int blockSize, - long maxFileSize, boolean prefetchBlocksOnOpen, boolean cacheDataInL1, + long maxFileSize, String keyFieldName, boolean prefetchBlocksOnOpen, boolean cacheDataInL1, boolean dropBehindCacheCompaction, BloomFilter bloomFilter, KeyValue.KVComparator hfileComparator) { this.hadoopConf = hadoopConf; this.compressionAlgorithm = compressionAlgorithm; @@ -56,6 +57,7 @@ public HoodieHFileConfig(Configuration hadoopConf, Compression.Algorithm compres this.dropBehindCacheCompaction = dropBehindCacheCompaction; this.bloomFilter = bloomFilter; this.hfileComparator = hfileComparator; + this.keyFieldName = keyFieldName; } public Configuration getHadoopConf() { @@ -97,4 +99,8 @@ public BloomFilter getBloomFilter() { public KeyValue.KVComparator getHfileComparator() { return hfileComparator; } + + public String getKeyFieldName() { + 
return keyFieldName; + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileWriter.java index a719bcb8f334f..2ad6d7f9220b0 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileWriter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileWriter.java @@ -38,6 +38,8 @@ import org.apache.hadoop.hbase.io.hfile.HFileContext; import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder; import org.apache.hadoop.io.Writable; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; import java.io.DataInput; import java.io.DataOutput; @@ -63,6 +65,8 @@ public class HoodieHFileWriter keyFieldSchema; private HFile.Writer writer; private String minRecordKey; private String maxRecordKey; @@ -77,6 +81,8 @@ public HoodieHFileWriter(String instantTime, Path file, HoodieHFileConfig hfileC this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, conf); this.fs = (HoodieWrapperFileSystem) this.file.getFileSystem(conf); this.hfileConfig = hfileConfig; + this.schema = schema; + this.keyFieldSchema = Option.ofNullable(schema.getField(hfileConfig.getKeyFieldName())); // TODO - compute this compression ratio dynamically by looking at the bytes written to the // stream and the actual file size reported by HDFS @@ -121,8 +127,25 @@ public boolean canWrite() { } @Override - public void writeAvro(String recordKey, IndexedRecord object) throws IOException { - byte[] value = HoodieAvroUtils.avroToBytes((GenericRecord)object); + public void writeAvro(String recordKey, IndexedRecord record) throws IOException { + byte[] value = null; + boolean isRecordSerialized = false; + if (keyFieldSchema.isPresent()) { + GenericRecord keyExcludedRecord = (GenericRecord) record; + int keyFieldPos = this.keyFieldSchema.get().pos(); + boolean isKeyAvailable = (record.get(keyFieldPos) != null && !(record.get(keyFieldPos).toString().isEmpty())); + if (isKeyAvailable) { + Object originalKey = keyExcludedRecord.get(keyFieldPos); + keyExcludedRecord.put(keyFieldPos, StringUtils.EMPTY_STRING); + value = HoodieAvroUtils.avroToBytes(keyExcludedRecord); + keyExcludedRecord.put(keyFieldPos, originalKey); + isRecordSerialized = true; + } + } + if (!isRecordSerialized) { + value = HoodieAvroUtils.avroToBytes((GenericRecord) record); + } + KeyValue kv = new KeyValue(recordKey.getBytes(), null, null, value); writer.append(kv); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetWriter.java index 4f51de35d24a9..3cee8c816d41f 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetWriter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetWriter.java @@ -51,13 +51,23 @@ public class HoodieParquetWriter !s.isEmpty()).collect(Collectors.toList()); - this.partitionPathFields = Arrays.stream(props.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()) - .split(",")).map(String::trim).filter(s -> !s.isEmpty()).collect(Collectors.toList()); + this.recordKeyFields = Arrays.stream(props.getString(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key()).split(",")) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .collect(Collectors.toList()); + this.partitionPathFields = 
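The new writeAvro path in HoodieHFileWriter serializes each record with its key field temporarily blanked out and then restored. Since the HFile KeyValue cell already carries the record key, this avoids storing the key a second time inside the Avro value bytes; a reader can repopulate the field from the cell key. A rough sketch of the round trip, using an illustrative two-field schema rather than a real Hudi one:

```java
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.avro.HoodieAvroUtils;

public class KeyExclusionSketch {
  public static void main(String[] args) {
    Schema schema = SchemaBuilder.record("rec").fields()
        .requiredString("key").requiredString("value").endRecord();
    GenericRecord record = new GenericData.Record(schema);
    record.put("key", "uuid-0001");
    record.put("value", "payload");

    int keyPos = schema.getField("key").pos();
    Object originalKey = record.get(keyPos);
    record.put(keyPos, "");                              // blank the key field
    byte[] value = HoodieAvroUtils.avroToBytes(record);  // value bytes carry no key
    record.put(keyPos, originalKey);                     // restore for the caller
    // 'value' becomes the HFile cell value; the cell key carries the record key.
    System.out.println(value.length + " bytes without the key inlined");
  }
}
```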
Arrays.stream(props.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()).split(",")) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .collect(Collectors.toList()); } @Override diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenUtils.java index d9de544d29b29..f1e41296f1dd3 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenUtils.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenUtils.java @@ -29,7 +29,7 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieKeyException; import org.apache.hudi.exception.HoodieNotSupportedException; -import org.apache.hudi.keygen.parser.AbstractHoodieDateTimeParser; +import org.apache.hudi.keygen.parser.BaseHoodieDateTimeParser; import java.io.IOException; import java.util.Arrays; @@ -161,9 +161,9 @@ public static String getPartitionPath(GenericRecord record, String partitionPath /** * Create a date time parser class for TimestampBasedKeyGenerator, passing in any configs needed. */ - public static AbstractHoodieDateTimeParser createDateTimeParser(TypedProperties props, String parserClass) throws IOException { + public static BaseHoodieDateTimeParser createDateTimeParser(TypedProperties props, String parserClass) throws IOException { try { - return (AbstractHoodieDateTimeParser) ReflectionUtils.loadClass(parserClass, props); + return (BaseHoodieDateTimeParser) ReflectionUtils.loadClass(parserClass, props); } catch (Throwable e) { throw new IOException("Could not load date time parser class " + parserClass, e); } @@ -196,4 +196,4 @@ public static KeyGenerator createKeyGeneratorByClassName(TypedProperties props) } return keyGenerator; } -} \ No newline at end of file +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/TimestampBasedAvroKeyGenerator.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/TimestampBasedAvroKeyGenerator.java index bc84ece503487..bce7e24c57a5f 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/TimestampBasedAvroKeyGenerator.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/TimestampBasedAvroKeyGenerator.java @@ -26,8 +26,8 @@ import org.apache.hudi.exception.HoodieKeyGeneratorException; import org.apache.hudi.exception.HoodieNotSupportedException; import org.apache.hudi.keygen.constant.KeyGeneratorOptions; -import org.apache.hudi.keygen.parser.AbstractHoodieDateTimeParser; -import org.apache.hudi.keygen.parser.HoodieDateTimeParserImpl; +import org.apache.hudi.keygen.parser.BaseHoodieDateTimeParser; +import org.apache.hudi.keygen.parser.HoodieDateTimeParser; import org.joda.time.DateTime; import org.joda.time.DateTimeZone; import org.joda.time.format.DateTimeFormat; @@ -56,7 +56,7 @@ public enum TimestampType implements Serializable { private final String outputDateFormat; private transient Option inputFormatter; private transient DateTimeFormatter partitionFormatter; - private final AbstractHoodieDateTimeParser parser; + private final BaseHoodieDateTimeParser parser; // TimeZone detailed settings reference // https://docs.oracle.com/javase/8/docs/api/java/util/TimeZone.html @@ -65,29 +65,6 @@ public enum TimestampType implements Serializable { protected final boolean encodePartitionPath; - /** - * Supported configs. 
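The reformatted constructor above parses the comma-separated record-key and partition-path field configs; the trim and filter steps make it tolerant of stray whitespace and empty tokens. For example:

```java
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class FieldListParsing {
  public static void main(String[] args) {
    String configured = "id, ts,,region ";     // a value a user might set
    List<String> fields = Arrays.stream(configured.split(","))
        .map(String::trim)                     // "ts", not " ts"
        .filter(s -> !s.isEmpty())             // drops the empty token from ",,"
        .collect(Collectors.toList());
    System.out.println(fields);                // [id, ts, region]
  }
}
```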
- */ - public static class Config { - - // One value from TimestampType above - public static final String TIMESTAMP_TYPE_FIELD_PROP = "hoodie.deltastreamer.keygen.timebased.timestamp.type"; - public static final String INPUT_TIME_UNIT = - "hoodie.deltastreamer.keygen.timebased.timestamp.scalar.time.unit"; - //This prop can now accept list of input date formats. - public static final String TIMESTAMP_INPUT_DATE_FORMAT_PROP = - "hoodie.deltastreamer.keygen.timebased.input.dateformat"; - public static final String TIMESTAMP_INPUT_DATE_FORMAT_LIST_DELIMITER_REGEX_PROP = "hoodie.deltastreamer.keygen.timebased.input.dateformat.list.delimiter.regex"; - public static final String TIMESTAMP_INPUT_TIMEZONE_FORMAT_PROP = "hoodie.deltastreamer.keygen.timebased.input.timezone"; - public static final String TIMESTAMP_OUTPUT_DATE_FORMAT_PROP = - "hoodie.deltastreamer.keygen.timebased.output.dateformat"; - //still keeping this prop for backward compatibility so that functionality for existing users does not break. - public static final String TIMESTAMP_TIMEZONE_FORMAT_PROP = - "hoodie.deltastreamer.keygen.timebased.timezone"; - public static final String TIMESTAMP_OUTPUT_TIMEZONE_FORMAT_PROP = "hoodie.deltastreamer.keygen.timebased.output.timezone"; - static final String DATE_TIME_PARSER_PROP = "hoodie.deltastreamer.keygen.datetime.parser.class"; - } - public TimestampBasedAvroKeyGenerator(TypedProperties config) throws IOException { this(config, config.getString(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key()), config.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key())); @@ -99,12 +76,12 @@ public TimestampBasedAvroKeyGenerator(TypedProperties config) throws IOException TimestampBasedAvroKeyGenerator(TypedProperties config, String recordKeyField, String partitionPathField) throws IOException { super(config, recordKeyField, partitionPathField); - String dateTimeParserClass = config.getString(Config.DATE_TIME_PARSER_PROP, HoodieDateTimeParserImpl.class.getName()); + String dateTimeParserClass = config.getString(KeyGeneratorOptions.Config.DATE_TIME_PARSER_PROP, HoodieDateTimeParser.class.getName()); this.parser = KeyGenUtils.createDateTimeParser(config, dateTimeParserClass); this.inputDateTimeZone = parser.getInputDateTimeZone(); this.outputDateTimeZone = parser.getOutputDateTimeZone(); this.outputDateFormat = parser.getOutputDateFormat(); - this.timestampType = TimestampType.valueOf(config.getString(Config.TIMESTAMP_TYPE_FIELD_PROP)); + this.timestampType = TimestampType.valueOf(config.getString(KeyGeneratorOptions.Config.TIMESTAMP_TYPE_FIELD_PROP)); switch (this.timestampType) { case EPOCHMILLISECONDS: @@ -114,7 +91,7 @@ public TimestampBasedAvroKeyGenerator(TypedProperties config) throws IOException timeUnit = SECONDS; break; case SCALAR: - String timeUnitStr = config.getString(Config.INPUT_TIME_UNIT, TimeUnit.SECONDS.toString()); + String timeUnitStr = config.getString(KeyGeneratorOptions.Config.INPUT_TIME_UNIT, TimeUnit.SECONDS.toString()); timeUnit = TimeUnit.valueOf(timeUnitStr.toUpperCase()); break; default: @@ -148,7 +125,7 @@ public Object getDefaultPartitionVal() { // {Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP} won't be null, it has been checked in the initialization process of // inputFormatter String delimiter = parser.getConfigInputDateFormatDelimiter(); - String format = config.getString(Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP, "").split(delimiter)[0]; + String format = config.getString(KeyGeneratorOptions.Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP, "").split(delimiter)[0]; // if both 
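The Config constants removed here move to KeyGeneratorOptions.Config with the property keys unchanged, so existing user configurations keep working. A hedged usage sketch (field names and values are illustrative; the property keys are the ones listed in the removed block):

```java
import java.io.IOException;

import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator;

public class TimestampKeyGenExample {
  public static void main(String[] args) throws IOException {
    TypedProperties props = new TypedProperties();
    // Record key / partition path fields, per KeyGeneratorOptions:
    props.setProperty("hoodie.datasource.write.recordkey.field", "id");
    props.setProperty("hoodie.datasource.write.partitionpath.field", "ts");
    // Timestamp props: the same keys as in the removed Config block:
    props.setProperty("hoodie.deltastreamer.keygen.timebased.timestamp.type", "EPOCHMILLISECONDS");
    props.setProperty("hoodie.deltastreamer.keygen.timebased.output.dateformat", "yyyy/MM/dd");
    TimestampBasedAvroKeyGenerator keyGen = new TimestampBasedAvroKeyGenerator(props);
  }
}
```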
input and output timeZone are not configured, use GMT. if (null != inputDateTimeZone) { @@ -200,7 +177,7 @@ public String getPartitionPath(Object partitionVal) { timeMs = convertLongTimeToMillis(((BigDecimal) partitionVal).longValue()); } else if (partitionVal instanceof CharSequence) { if (!inputFormatter.isPresent()) { - throw new HoodieException("Missing inputformatter. Ensure " + Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP + " config is set when timestampType is DATE_STRING or MIXED!"); + throw new HoodieException("Missing input formatter. Ensure " + KeyGeneratorOptions.Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP + " config is set when timestampType is DATE_STRING or MIXED!"); } DateTime parsedDateTime = inputFormatter.get().parseDateTime(partitionVal.toString()); if (this.outputDateTimeZone == null) { @@ -224,7 +201,7 @@ public String getPartitionPath(Object partitionVal) { private long convertLongTimeToMillis(Long partitionVal) { if (timeUnit == null) { // should not be possible - throw new RuntimeException(Config.INPUT_TIME_UNIT + " is not specified but scalar it supplied as time value"); + throw new RuntimeException(KeyGeneratorOptions.Config.INPUT_TIME_UNIT + " is not specified but a scalar is supplied as the time value"); } return MILLISECONDS.convert(partitionVal, timeUnit); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/parser/AbstractHoodieDateTimeParser.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/parser/BaseHoodieDateTimeParser.java similarity index 84% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/parser/AbstractHoodieDateTimeParser.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/parser/BaseHoodieDateTimeParser.java index 6fb05c30be11a..74c62fc63f537 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/parser/AbstractHoodieDateTimeParser.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/parser/BaseHoodieDateTimeParser.java @@ -19,24 +19,24 @@ import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.util.Option; -import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator.Config; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.joda.time.DateTimeZone; import org.joda.time.format.DateTimeFormatter; import java.io.Serializable; -public abstract class AbstractHoodieDateTimeParser implements Serializable { +public abstract class BaseHoodieDateTimeParser implements Serializable { protected final TypedProperties config; protected final String configInputDateFormatDelimiter; - public AbstractHoodieDateTimeParser(TypedProperties config) { + public BaseHoodieDateTimeParser(TypedProperties config) { this.config = config; this.configInputDateFormatDelimiter = initInputDateFormatDelimiter(); } private String initInputDateFormatDelimiter() { - String inputDateFormatDelimiter = config.getString(Config.TIMESTAMP_INPUT_DATE_FORMAT_LIST_DELIMITER_REGEX_PROP, ",").trim(); + String inputDateFormatDelimiter = config.getString(KeyGeneratorOptions.Config.TIMESTAMP_INPUT_DATE_FORMAT_LIST_DELIMITER_REGEX_PROP, ",").trim(); inputDateFormatDelimiter = inputDateFormatDelimiter.isEmpty() ? "," : inputDateFormatDelimiter; return inputDateFormatDelimiter; } @@ -45,7 +45,7 @@ private String initInputDateFormatDelimiter() { * Returns the output date format in which the partition paths will be created for the hudi dataset.
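convertLongTimeToMillis, shown above, normalizes SCALAR timestamps with the configured input time unit before the partition path is formatted. The conversion is plain java.util.concurrent.TimeUnit arithmetic:

```java
import static java.util.concurrent.TimeUnit.MILLISECONDS;

import java.util.concurrent.TimeUnit;

public class ScalarToMillis {
  public static void main(String[] args) {
    long partitionVal = 1_643_155_200L;   // scalar value in seconds (illustrative)
    TimeUnit timeUnit = TimeUnit.SECONDS; // from ...timebased.timestamp.scalar.time.unit
    long timeMs = MILLISECONDS.convert(partitionVal, timeUnit);
    System.out.println(timeMs);           // 1643155200000
  }
}
```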
*/ public String getOutputDateFormat() { - return config.getString(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP); + return config.getString(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP); } /** diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/parser/HoodieDateTimeParserImpl.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/parser/HoodieDateTimeParser.java similarity index 68% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/parser/HoodieDateTimeParserImpl.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/parser/HoodieDateTimeParser.java index 81960ea168391..c15d484df7a53 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/parser/HoodieDateTimeParserImpl.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/parser/HoodieDateTimeParser.java @@ -20,8 +20,8 @@ import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.util.Option; import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator.TimestampType; -import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator.Config; import org.apache.hudi.keygen.KeyGenUtils; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.joda.time.DateTimeZone; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; @@ -32,7 +32,7 @@ import java.util.Collections; import java.util.TimeZone; -public class HoodieDateTimeParserImpl extends AbstractHoodieDateTimeParser { +public class HoodieDateTimeParser extends BaseHoodieDateTimeParser { private String configInputDateFormatList; @@ -40,15 +40,15 @@ public class HoodieDateTimeParserImpl extends AbstractHoodieDateTimeParser { // https://docs.oracle.com/javase/8/docs/api/java/util/TimeZone.html private final DateTimeZone inputDateTimeZone; - public HoodieDateTimeParserImpl(TypedProperties config) { + public HoodieDateTimeParser(TypedProperties config) { super(config); - KeyGenUtils.checkRequiredProperties(config, Arrays.asList(Config.TIMESTAMP_TYPE_FIELD_PROP, Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP)); + KeyGenUtils.checkRequiredProperties(config, Arrays.asList(KeyGeneratorOptions.Config.TIMESTAMP_TYPE_FIELD_PROP, KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP)); this.inputDateTimeZone = getInputDateTimeZone(); } private DateTimeFormatter getInputDateFormatter() { if (this.configInputDateFormatList.isEmpty()) { - throw new IllegalArgumentException(Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP + " configuration is required"); + throw new IllegalArgumentException(KeyGeneratorOptions.Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP + " configuration is required"); } DateTimeFormatter formatter = new DateTimeFormatterBuilder() @@ -72,16 +72,16 @@ private DateTimeFormatter getInputDateFormatter() { @Override public String getOutputDateFormat() { - return config.getString(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP); + return config.getString(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP); } @Override public Option getInputFormatter() { - TimestampType timestampType = TimestampType.valueOf(config.getString(Config.TIMESTAMP_TYPE_FIELD_PROP)); + TimestampType timestampType = TimestampType.valueOf(config.getString(KeyGeneratorOptions.Config.TIMESTAMP_TYPE_FIELD_PROP)); if (timestampType == TimestampType.DATE_STRING || timestampType == TimestampType.MIXED) { KeyGenUtils.checkRequiredProperties(config, - 
Collections.singletonList(Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP)); - this.configInputDateFormatList = config.getString(Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP, ""); + Collections.singletonList(KeyGeneratorOptions.Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP)); + this.configInputDateFormatList = config.getString(KeyGeneratorOptions.Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP, ""); return Option.of(getInputDateFormatter()); } @@ -91,10 +91,10 @@ public Option getInputFormatter() { @Override public DateTimeZone getInputDateTimeZone() { String inputTimeZone; - if (config.containsKey(Config.TIMESTAMP_TIMEZONE_FORMAT_PROP)) { - inputTimeZone = config.getString(Config.TIMESTAMP_TIMEZONE_FORMAT_PROP, "GMT"); + if (config.containsKey(KeyGeneratorOptions.Config.TIMESTAMP_TIMEZONE_FORMAT_PROP)) { + inputTimeZone = config.getString(KeyGeneratorOptions.Config.TIMESTAMP_TIMEZONE_FORMAT_PROP, "GMT"); } else { - inputTimeZone = config.getString(Config.TIMESTAMP_INPUT_TIMEZONE_FORMAT_PROP, ""); + inputTimeZone = config.getString(KeyGeneratorOptions.Config.TIMESTAMP_INPUT_TIMEZONE_FORMAT_PROP, ""); } return !inputTimeZone.trim().isEmpty() ? DateTimeZone.forTimeZone(TimeZone.getTimeZone(inputTimeZone)) : null; } @@ -102,10 +102,10 @@ public DateTimeZone getInputDateTimeZone() { @Override public DateTimeZone getOutputDateTimeZone() { String outputTimeZone; - if (config.containsKey(Config.TIMESTAMP_TIMEZONE_FORMAT_PROP)) { - outputTimeZone = config.getString(Config.TIMESTAMP_TIMEZONE_FORMAT_PROP, "GMT"); + if (config.containsKey(KeyGeneratorOptions.Config.TIMESTAMP_TIMEZONE_FORMAT_PROP)) { + outputTimeZone = config.getString(KeyGeneratorOptions.Config.TIMESTAMP_TIMEZONE_FORMAT_PROP, "GMT"); } else { - outputTimeZone = config.getString(Config.TIMESTAMP_OUTPUT_TIMEZONE_FORMAT_PROP, ""); + outputTimeZone = config.getString(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_TIMEZONE_FORMAT_PROP, ""); } return !outputTimeZone.trim().isEmpty() ? 
DateTimeZone.forTimeZone(TimeZone.getTimeZone(outputTimeZone)) : null; } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java index 32f05cbad870c..eee676822a8aa 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java @@ -24,13 +24,14 @@ import org.apache.hudi.avro.model.HoodieMetadataRecord; import org.apache.hudi.avro.model.HoodieRestoreMetadata; import org.apache.hudi.avro.model.HoodieRollbackMetadata; -import org.apache.hudi.client.AbstractHoodieWriteClient; +import org.apache.hudi.client.BaseHoodieWriteClient; import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.config.SerializableConfiguration; import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.fs.ConsistencyGuardConfig; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieCleaningPolicy; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; @@ -39,6 +40,7 @@ import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodiePartitionMetadata; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.WriteConcurrencyMode; import org.apache.hudi.common.table.HoodieTableConfig; @@ -50,6 +52,7 @@ import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.common.util.HoodieTimer; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ValidationUtils; @@ -96,7 +99,7 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta // Virtual keys support for metadata table. This Field is // from the metadata payload schema. - private static final String RECORD_KEY_FIELD = HoodieMetadataPayload.SCHEMA_FIELD_ID_KEY; + private static final String RECORD_KEY_FIELD_NAME = HoodieMetadataPayload.KEY_FIELD_NAME; protected HoodieWriteConfig metadataWriteConfig; protected HoodieWriteConfig dataWriteConfig; @@ -109,6 +112,8 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta protected boolean enabled; protected SerializableConfiguration hadoopConf; protected final transient HoodieEngineContext engineContext; + // TODO: HUDI-3258 Support secondary key via multiple partitions within a single type + protected final List enabledPartitionTypes; /** * Hudi backed table metadata writer. 
@@ -128,6 +133,8 @@ protected HoodieBackedTableMetadataWriter(Configu this.dataWriteConfig = writeConfig; this.engineContext = engineContext; this.hadoopConf = new SerializableConfiguration(hadoopConf); + this.metrics = Option.empty(); + this.enabledPartitionTypes = new ArrayList<>(); if (writeConfig.isMetadataTableEnabled()) { this.tableName = writeConfig.getTableName() + METADATA_TABLE_NAME_SUFFIX; @@ -145,22 +152,67 @@ protected HoodieBackedTableMetadataWriter(Configu ValidationUtils.checkArgument(!this.metadataWriteConfig.isMetadataTableEnabled(), "File listing cannot be used for Metadata Table"); - initRegistry(); this.dataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(dataWriteConfig.getBasePath()).build(); + enablePartitions(); + initRegistry(); initialize(engineContext, actionMetadata, inflightInstantTimestamp); initTableMetadata(); } else { enabled = false; - this.metrics = Option.empty(); } } public HoodieBackedTableMetadataWriter(Configuration hadoopConf, HoodieWriteConfig writeConfig, - HoodieEngineContext engineContext) { + HoodieEngineContext engineContext) { this(hadoopConf, writeConfig, engineContext, Option.empty(), Option.empty()); } + /** + * Enable metadata table partitions based on config. + */ + private void enablePartitions() { + final HoodieMetadataConfig metadataConfig = dataWriteConfig.getMetadataConfig(); + boolean isBootstrapCompleted; + Option metaClient = Option.empty(); + try { + isBootstrapCompleted = dataMetaClient.getFs().exists(new Path(metadataWriteConfig.getBasePath(), HoodieTableMetaClient.METAFOLDER_NAME)); + if (isBootstrapCompleted) { + metaClient = Option.of(HoodieTableMetaClient.builder().setConf(hadoopConf.get()) + .setBasePath(metadataWriteConfig.getBasePath()).build()); + } + } catch (IOException e) { + throw new HoodieException("Failed to enable metadata partitions!", e); + } + + Option fsView = Option.ofNullable( + metaClient.isPresent() ? HoodieTableMetadataUtil.getFileSystemView(metaClient.get()) : null); + enablePartition(MetadataPartitionType.FILES, metadataConfig, metaClient, fsView, isBootstrapCompleted); + if (metadataConfig.isBloomFilterIndexEnabled()) { + enablePartition(MetadataPartitionType.BLOOM_FILTERS, metadataConfig, metaClient, fsView, isBootstrapCompleted); + } + if (metadataConfig.isColumnStatsIndexEnabled()) { + enablePartition(MetadataPartitionType.COLUMN_STATS, metadataConfig, metaClient, fsView, isBootstrapCompleted); + } + } + + /** + * Enable metadata table partition. 
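enablePartitions() above always registers the FILES partition once the metadata table is on, and adds BLOOM_FILTERS and COLUMN_STATS only when the corresponding index configs are enabled. Stripped of the meta-client and file-system-view plumbing, the gating reduces to a sketch like this:

```java
import java.util.ArrayList;
import java.util.List;

public class PartitionGatingSketch {
  enum PartitionType { FILES, BLOOM_FILTERS, COLUMN_STATS }

  // FILES is unconditional; the index partitions are opt-in via config.
  static List<PartitionType> enabledPartitions(boolean bloomIndexOn, boolean columnStatsIndexOn) {
    List<PartitionType> enabled = new ArrayList<>();
    enabled.add(PartitionType.FILES);
    if (bloomIndexOn) {
      enabled.add(PartitionType.BLOOM_FILTERS);
    }
    if (columnStatsIndexOn) {
      enabled.add(PartitionType.COLUMN_STATS);
    }
    return enabled;
  }

  public static void main(String[] args) {
    System.out.println(enabledPartitions(true, false)); // [FILES, BLOOM_FILTERS]
  }
}
```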
+ * + * @param partitionType - Metadata table partition type + * @param metadataConfig - Table config + * @param metaClient - Meta client for the metadata table + * @param fsView - Metadata table filesystem view to use + * @param isBootstrapCompleted - Is metadata table bootstrap completed + */ + private void enablePartition(final MetadataPartitionType partitionType, final HoodieMetadataConfig metadataConfig, + final Option metaClient, Option fsView, boolean isBootstrapCompleted) { + final int fileGroupCount = HoodieTableMetadataUtil.getPartitionFileGroupCount(partitionType, metaClient, fsView, + metadataConfig, isBootstrapCompleted); + partitionType.setFileGroupCount(fileGroupCount); + this.enabledPartitionTypes.add(partitionType); + } + protected abstract void initRegistry(); /** @@ -217,8 +269,8 @@ private HoodieWriteConfig createMetadataWriteConfig(HoodieWriteConfig writeConfi // RecordKey properties are needed for the metadata table records final Properties properties = new Properties(); - properties.put(HoodieTableConfig.RECORDKEY_FIELDS.key(), RECORD_KEY_FIELD); - properties.put("hoodie.datasource.write.recordkey.field", RECORD_KEY_FIELD); + properties.put(HoodieTableConfig.RECORDKEY_FIELDS.key(), RECORD_KEY_FIELD_NAME); + properties.put("hoodie.datasource.write.recordkey.field", RECORD_KEY_FIELD_NAME); builder.withProperties(properties); if (writeConfig.isMetricsOn()) { @@ -257,10 +309,14 @@ public HoodieWriteConfig getWriteConfig() { return metadataWriteConfig; } - public HoodieBackedTableMetadata metadata() { + public HoodieBackedTableMetadata getTableMetadata() { return metadata; } + public List getEnabledPartitionTypes() { + return this.enabledPartitionTypes; + } + /** * Initialize the metadata table if it does not exist. * @@ -454,13 +510,13 @@ private boolean bootstrapFromFilesystem(HoodieEngineContext engineContext, Hoodi .setArchiveLogFolder(ARCHIVELOG_FOLDER.defaultValue()) .setPayloadClassName(HoodieMetadataPayload.class.getName()) .setBaseFileFormat(HoodieFileFormat.HFILE.toString()) - .setRecordKeyFields(RECORD_KEY_FIELD) + .setRecordKeyFields(RECORD_KEY_FIELD_NAME) .setPopulateMetaFields(dataWriteConfig.getMetadataConfig().populateMetaFields()) .setKeyGeneratorClassProp(HoodieTableMetadataKeyGenerator.class.getCanonicalName()) .initTable(hadoopConf.get(), metadataWriteConfig.getBasePath()); initTableMetadata(); - initializeFileGroups(dataMetaClient, MetadataPartitionType.FILES, createInstantTime, 1); + initializeEnabledFileGroups(dataMetaClient, createInstantTime); // List all partitions in the basePath of the containing dataset LOG.info("Initializing metadata table by using file listings in " + dataWriteConfig.getBasePath()); @@ -529,13 +585,27 @@ private List listAllPartitions(HoodieTableMetaClient datasetMetaC return partitionsToBootstrap; } + /** + * Initialize file groups for all the enabled partition types. + * + * @param dataMetaClient - Meta client for the data table + * @param createInstantTime - Metadata table create instant time + * @throws IOException + */ + private void initializeEnabledFileGroups(HoodieTableMetaClient dataMetaClient, String createInstantTime) throws IOException { + for (MetadataPartitionType enabledPartitionType : this.enabledPartitionTypes) { + initializeFileGroups(dataMetaClient, enabledPartitionType, createInstantTime, + enabledPartitionType.getFileGroupCount()); + } + } + /** * Initialize file groups for a partition. For file listing, we just have one file group. 
* * All FileGroups for a given metadata partition have a fixed prefix as per the {@link MetadataPartitionType#getFileIdPrefix()}. * Each file group is suffixed with 4 digits with increments of 1 starting with 0000. * - * Lets say we configure 10 file groups for record level index partittion, and prefix as "record-index-bucket-" + * Let's say we configure 10 file groups for record level index partition, and prefix as "record-index-bucket-" * File groups will be named as: * record-index-bucket-0000, .... -> ..., record-index-bucket-0009 */ @@ -550,12 +620,12 @@ private void initializeFileGroups(HoodieTableMetaClient dataMetaClient, Metadata final HoodieDeleteBlock block = new HoodieDeleteBlock(new HoodieKey[0], blockHeader); LOG.info(String.format("Creating %d file groups for partition %s with base fileId %s at instant time %s", - fileGroupCount, metadataPartition.partitionPath(), metadataPartition.getFileIdPrefix(), instantTime)); + fileGroupCount, metadataPartition.getPartitionPath(), metadataPartition.getFileIdPrefix(), instantTime)); for (int i = 0; i < fileGroupCount; ++i) { final String fileGroupFileId = String.format("%s%04d", metadataPartition.getFileIdPrefix(), i); try { HoodieLogFormat.Writer writer = HoodieLogFormat.newWriterBuilder() - .onParentPath(FSUtils.getPartitionPath(metadataWriteConfig.getBasePath(), metadataPartition.partitionPath())) + .onParentPath(FSUtils.getPartitionPath(metadataWriteConfig.getBasePath(), metadataPartition.getPartitionPath())) .withFileId(fileGroupFileId).overBaseCommit(instantTime) .withLogVersion(HoodieLogFile.LOGFILE_BASE_VERSION) .withFileSize(0L) @@ -567,7 +637,7 @@ private void initializeFileGroups(HoodieTableMetaClient dataMetaClient, Metadata writer.appendBlock(block); writer.close(); } catch (InterruptedException e) { - throw new HoodieException("Failed to created fileGroup " + fileGroupFileId + " for partition " + metadataPartition.partitionPath(), e); + throw new HoodieException("Failed to create fileGroup " + fileGroupFileId + " for partition " + metadataPartition.getPartitionPath(), e); } } } @@ -577,7 +647,7 @@ private void initializeFileGroups(HoodieTableMetaClient dataMetaClient, Metadata * Updates from different commit metadata types use the same method to convert them to HoodieRecords.
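As the javadoc above describes, file group IDs are the partition's fileId prefix plus a zero-padded four-digit suffix; the %s%04d format string in initializeFileGroups produces exactly that sequence:

```java
public class FileGroupNaming {
  public static void main(String[] args) {
    String fileIdPrefix = "record-index-bucket-";
    for (int i = 0; i < 10; i++) {
      System.out.println(String.format("%s%04d", fileIdPrefix, i));
    }
    // record-index-bucket-0000, record-index-bucket-0001, ..., record-index-bucket-0009
  }
}
```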
*/ private interface ConvertMetadataFunction { - List convertMetadata(); + Map> convertMetadata(); } /** @@ -589,8 +659,8 @@ private interface ConvertMetadataFunction { */ private void processAndCommit(String instantTime, ConvertMetadataFunction convertMetadataFunction, boolean canTriggerTableService) { if (enabled && metadata != null) { - List records = convertMetadataFunction.convertMetadata(); - commit(engineContext.parallelize(records, 1), MetadataPartitionType.FILES.partitionPath(), instantTime, canTriggerTableService); + Map> partitionRecordsMap = convertMetadataFunction.convertMetadata(); + commit(instantTime, partitionRecordsMap, canTriggerTableService); } } @@ -602,7 +672,8 @@ private void processAndCommit(String instantTime, ConvertMetadataFunction co */ @Override public void update(HoodieCommitMetadata commitMetadata, String instantTime, boolean isTableServiceAction) { - processAndCommit(instantTime, () -> HoodieTableMetadataUtil.convertMetadataToRecords(commitMetadata, instantTime), !isTableServiceAction); + processAndCommit(instantTime, () -> HoodieTableMetadataUtil.convertMetadataToRecords(engineContext, enabledPartitionTypes, + commitMetadata, dataMetaClient, dataWriteConfig.isMetadataIndexColumnStatsForAllColumnsEnabled(), instantTime), !isTableServiceAction); } /** @@ -613,8 +684,8 @@ public void update(HoodieCommitMetadata commitMetadata, String instantTime, bool */ @Override public void update(HoodieCleanMetadata cleanMetadata, String instantTime) { - processAndCommit(instantTime, () -> HoodieTableMetadataUtil.convertMetadataToRecords(cleanMetadata, instantTime), - false); + processAndCommit(instantTime, () -> HoodieTableMetadataUtil.convertMetadataToRecords(engineContext, enabledPartitionTypes, + cleanMetadata, dataMetaClient, instantTime), false); } /** @@ -625,8 +696,9 @@ public void update(HoodieCleanMetadata cleanMetadata, String instantTime) { */ @Override public void update(HoodieRestoreMetadata restoreMetadata, String instantTime) { - processAndCommit(instantTime, () -> HoodieTableMetadataUtil.convertMetadataToRecords(metadataMetaClient.getActiveTimeline(), - restoreMetadata, instantTime, metadata.getSyncedInstantTime()), false); + processAndCommit(instantTime, () -> HoodieTableMetadataUtil.convertMetadataToRecords(engineContext, + enabledPartitionTypes, metadataMetaClient.getActiveTimeline(), restoreMetadata, dataMetaClient, instantTime, + metadata.getSyncedInstantTime()), false); } /** @@ -650,9 +722,11 @@ public void update(HoodieRollbackMetadata rollbackMetadata, String instantTime) } } - List records = HoodieTableMetadataUtil.convertMetadataToRecords(metadataMetaClient.getActiveTimeline(), rollbackMetadata, instantTime, - metadata.getSyncedInstantTime(), wasSynced); - commit(engineContext.parallelize(records, 1), MetadataPartitionType.FILES.partitionPath(), instantTime, false); + Map> records = + HoodieTableMetadataUtil.convertMetadataToRecords(engineContext, enabledPartitionTypes, + metadataMetaClient.getActiveTimeline(), rollbackMetadata, dataMetaClient, instantTime, + metadata.getSyncedInstantTime(), wasSynced); + commit(instantTime, records, false); } } @@ -665,12 +739,47 @@ public void close() throws Exception { /** * Commit the {@code HoodieRecord}s to Metadata Table as a new delta-commit. - * @param records The HoodieData of records to be written. - * @param partitionName The partition to which the records are to be written. - * @param instantTime The timestamp to use for the deltacommit. 
+ * + * @param instantTime - Action instant time for this commit + * @param partitionRecordsMap - Map of partition name to its records to commit * @param canTriggerTableService true if table services can be scheduled and executed. false otherwise. */ - protected abstract void commit(HoodieData records, String partitionName, String instantTime, boolean canTriggerTableService); + protected abstract void commit( + String instantTime, Map> partitionRecordsMap, + boolean canTriggerTableService); + + /** + * Tag each record with the location in the given partition. + * The record is tagged with respective file slice's location based on its record key. + */ + protected HoodieData prepRecords(Map> partitionRecordsMap) { + // The result set + HoodieData allPartitionRecords = engineContext.emptyHoodieData(); + + HoodieTableFileSystemView fsView = HoodieTableMetadataUtil.getFileSystemView(metadataMetaClient); + for (Map.Entry> entry : partitionRecordsMap.entrySet()) { + final String partitionName = entry.getKey().getPartitionPath(); + final int fileGroupCount = entry.getKey().getFileGroupCount(); + HoodieData records = entry.getValue(); + + List fileSlices = + HoodieTableMetadataUtil.getPartitionLatestFileSlices(metadataMetaClient, Option.ofNullable(fsView), partitionName); + ValidationUtils.checkArgument(fileSlices.size() == fileGroupCount, + String.format("Invalid number of file groups for partition:%s, found=%d, required=%d", + partitionName, fileSlices.size(), fileGroupCount)); + + HoodieData rddSinglePartitionRecords = records.map(r -> { + FileSlice slice = fileSlices.get(HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex(r.getRecordKey(), + fileGroupCount)); + r.setCurrentLocation(new HoodieRecordLocation(slice.getBaseInstantTime(), slice.getFileId())); + return r; + }); + + allPartitionRecords = allPartitionRecords.union(rddSinglePartitionRecords); + } + return allPartitionRecords; + } /** * Perform a compaction on the Metadata Table. @@ -682,7 +791,7 @@ public void close() throws Exception { * 2. In multi-writer scenario, a parallel operation with a greater instantTime may have completed creating a * deltacommit. */ - protected void compactIfNecessary(AbstractHoodieWriteClient writeClient, String instantTime) { + protected void compactIfNecessary(BaseHoodieWriteClient writeClient, String instantTime) { // finish off any pending compactions if any from previous attempt. writeClient.runAnyPendingCompactions(); @@ -706,7 +815,7 @@ protected void compactIfNecessary(AbstractHoodieWriteClient writeClient, String } } - protected void cleanIfNecessary(AbstractHoodieWriteClient writeClient, String instantTime) { + protected void cleanIfNecessary(BaseHoodieWriteClient writeClient, String instantTime) { Option lastCompletedCompactionInstant = metadataMetaClient.reloadActiveTimeline() .getCommitTimeline().filterCompletedInstants().lastInstant(); if (lastCompletedCompactionInstant.isPresent() @@ -735,14 +844,19 @@ protected void bootstrapCommit(List partitionInfoList, String cre List partitions = partitionInfoList.stream().map(p -> p.getRelativePath().isEmpty() ? 
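prepRecords, defined above, tags each metadata record with the file slice it must land in by hashing the record key into one of the partition's fixed file groups via HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex. The real hash lives in that utility; the idea is stable modulo bucketing, sketched here under that assumption:

```java
public class FileGroupBucketing {
  // Illustrative only: the actual mapping is in HoodieTableMetadataUtil and may
  // use a different hash; what matters is a stable key -> [0, fileGroupCount) index.
  static int fileGroupIndex(String recordKey, int fileGroupCount) {
    return Math.floorMod(recordKey.hashCode(), fileGroupCount);
  }

  public static void main(String[] args) {
    int fileGroupCount = 10;
    System.out.println(fileGroupIndex("2022/01/26/part-0001.parquet", fileGroupCount));
  }
}
```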
NON_PARTITIONED_NAME : p.getRelativePath()).collect(Collectors.toList()); final int totalFiles = partitionInfoList.stream().mapToInt(p -> p.getTotalFiles()).sum(); + final Map> partitionToRecordsMap = new HashMap<>(); // Record which saves the list of all partitions HoodieRecord allPartitionRecord = HoodieMetadataPayload.createPartitionListRecord(partitions); if (partitions.isEmpty()) { - // in case of boostrapping of a fresh table, there won't be any partitions, but we need to make a boostrap commit - commit(engineContext.parallelize(Collections.singletonList(allPartitionRecord), 1), MetadataPartitionType.FILES.partitionPath(), createInstantTime, false); + // in case of bootstrapping of a fresh table, there won't be any partitions, but we need to make a bootstrap commit + final HoodieData allPartitionRecordsRDD = engineContext.parallelize( + Collections.singletonList(allPartitionRecord), 1); + partitionToRecordsMap.put(MetadataPartitionType.FILES, allPartitionRecordsRDD); + commit(createInstantTime, partitionToRecordsMap, false); return; } + HoodieData partitionRecords = engineContext.parallelize(Arrays.asList(allPartitionRecord), 1); if (!partitionInfoList.isEmpty()) { HoodieData fileListRecords = engineContext.parallelize(partitionInfoList, partitionInfoList.size()).map(partitionInfo -> { @@ -762,7 +876,8 @@ protected void bootstrapCommit(List partitionInfoList, String cre LOG.info("Committing " + partitions.size() + " partitions and " + totalFiles + " files to metadata"); ValidationUtils.checkState(partitionRecords.count() == (partitions.size() + 1)); - commit(partitionRecords, MetadataPartitionType.FILES.partitionPath(), createInstantTime, false); + partitionToRecordsMap.put(MetadataPartitionType.FILES, partitionRecords); + commit(createInstantTime, partitionToRecordsMap, false); } /** diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataKeyGenerator.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataKeyGenerator.java index 4ec143bf06789..332be73b14f57 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataKeyGenerator.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataKeyGenerator.java @@ -42,7 +42,7 @@ public HoodieTableMetadataKeyGenerator(TypedProperties config) { @Override public String getRecordKey(GenericRecord record) { - return KeyGenUtils.getRecordKey(record, HoodieMetadataPayload.SCHEMA_FIELD_ID_KEY, isConsistentLogicalTimestampEnabled()); + return KeyGenUtils.getRecordKey(record, HoodieMetadataPayload.KEY_FIELD_NAME, isConsistentLogicalTimestampEnabled()); } @Override diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/HoodieMetrics.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/HoodieMetrics.java index e874047b8c644..d13110feef228 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/HoodieMetrics.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/HoodieMetrics.java @@ -130,6 +130,26 @@ public Timer.Context getIndexCtx() { return indexTimer == null ? null : indexTimer.time(); } + public void updateMetricsForEmptyData(String actionType) { + if (!config.isMetricsOn() || !config.getMetricsReporterType().equals(MetricsReporterType.PROMETHEUS_PUSHGATEWAY)) { + // No-op if metrics are not of type PROMETHEUS_PUSHGATEWAY.
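updateMetricsForEmptyData appears aimed at the push-gateway model, where the gateway keeps serving the last pushed values: after a round that writes no data, pushing explicit zero gauges keeps dashboards from reading stale counts (hence the guard restricting it to PROMETHEUS_PUSHGATEWAY). In Dropwizard terms, each Metrics.registerGauge(name, 0) call amounts to a constant-zero gauge, roughly:

```java
import com.codahale.metrics.Gauge;
import com.codahale.metrics.MetricRegistry;

public class ZeroGaugeSketch {
  public static void main(String[] args) {
    MetricRegistry registry = new MetricRegistry();
    // Rough equivalent of Metrics.registerGauge(name, 0): a constant-zero gauge,
    // so the next push overwrites whatever value the gateway last retained.
    registry.register("commit.totalRecordsWritten", (Gauge<Long>) () -> 0L);
    System.out.println(registry.getGauges().keySet());
  }
}
```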
+ return; + } + Metrics.registerGauge(getMetricsName(actionType, "totalPartitionsWritten"), 0); + Metrics.registerGauge(getMetricsName(actionType, "totalFilesInsert"), 0); + Metrics.registerGauge(getMetricsName(actionType, "totalFilesUpdate"), 0); + Metrics.registerGauge(getMetricsName(actionType, "totalRecordsWritten"), 0); + Metrics.registerGauge(getMetricsName(actionType, "totalUpdateRecordsWritten"), 0); + Metrics.registerGauge(getMetricsName(actionType, "totalInsertRecordsWritten"), 0); + Metrics.registerGauge(getMetricsName(actionType, "totalBytesWritten"), 0); + Metrics.registerGauge(getMetricsName(actionType, "totalScanTime"), 0); + Metrics.registerGauge(getMetricsName(actionType, "totalCreateTime"), 0); + Metrics.registerGauge(getMetricsName(actionType, "totalUpsertTime"), 0); + Metrics.registerGauge(getMetricsName(actionType, "totalCompactedRecordsUpdated"), 0); + Metrics.registerGauge(getMetricsName(actionType, "totalLogFilesCompacted"), 0); + Metrics.registerGauge(getMetricsName(actionType, "totalLogFilesSize"), 0); + } + public void updateCommitMetrics(long commitEpochTimeInMs, long durationInMs, HoodieCommitMetadata metadata, String actionType) { updateCommitTimingMetrics(commitEpochTimeInMs, durationInMs, metadata, actionType); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsReporterFactory.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsReporterFactory.java index dc9e80431b8d7..d81e337b28d7a 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsReporterFactory.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/MetricsReporterFactory.java @@ -23,12 +23,12 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.metrics.cloudwatch.CloudWatchMetricsReporter; +import org.apache.hudi.metrics.custom.CustomizableMetricsReporter; import org.apache.hudi.metrics.datadog.DatadogMetricsReporter; - -import com.codahale.metrics.MetricRegistry; import org.apache.hudi.metrics.prometheus.PrometheusReporter; import org.apache.hudi.metrics.prometheus.PushGatewayMetricsReporter; -import org.apache.hudi.metrics.userdefined.AbstractUserDefinedMetricsReporter; + +import com.codahale.metrics.MetricRegistry; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -47,9 +47,9 @@ public static MetricsReporter createReporter(HoodieWriteConfig config, MetricReg if (!StringUtils.isNullOrEmpty(reporterClassName)) { Object instance = ReflectionUtils.loadClass( reporterClassName, new Class[] {Properties.class, MetricRegistry.class}, config.getProps(), registry); - if (!(instance instanceof AbstractUserDefinedMetricsReporter)) { + if (!(instance instanceof CustomizableMetricsReporter)) { throw new HoodieException(config.getMetricReporterClassName() - + " is not a subclass of AbstractUserDefinedMetricsReporter"); + + " is not a subclass of CustomizableMetricsReporter"); } return (MetricsReporter) instance; } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/custom/CustomizableMetricsReporter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/custom/CustomizableMetricsReporter.java new file mode 100644 index 0000000000000..13574b1e15693 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/custom/CustomizableMetricsReporter.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation 
(ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.metrics.custom; + +import org.apache.hudi.metrics.MetricsReporter; + +import com.codahale.metrics.MetricRegistry; + +import java.util.Properties; + +/** + * Extensible metrics reporter for custom implementation. + */ +public abstract class CustomizableMetricsReporter extends MetricsReporter { + private Properties props; + private MetricRegistry registry; + + public CustomizableMetricsReporter(Properties props, MetricRegistry registry) { + this.props = props; + this.registry = registry; + } + + public Properties getProps() { + return props; + } + + public MetricRegistry getRegistry() { + return registry; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/userdefined/AbstractUserDefinedMetricsReporter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/userdefined/AbstractUserDefinedMetricsReporter.java index 0a0d7bbe123a6..715b9564c5f70 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/userdefined/AbstractUserDefinedMetricsReporter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/userdefined/AbstractUserDefinedMetricsReporter.java @@ -7,38 +7,31 @@ * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. */ package org.apache.hudi.metrics.userdefined; +import org.apache.hudi.metrics.custom.CustomizableMetricsReporter; + import com.codahale.metrics.MetricRegistry; -import org.apache.hudi.metrics.MetricsReporter; + import java.util.Properties; /** - * Abstract class of user defined metrics reporter. + * @deprecated Extend {@link CustomizableMetricsReporter} instead. 
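With this rename, user code should extend CustomizableMetricsReporter; MetricsReporterFactory instantiates the configured class reflectively through a (Properties, MetricRegistry) constructor and rejects anything that is not a CustomizableMetricsReporter. A hypothetical reporter could look like the sketch below; the start/report/stop method set is an assumption for illustration, so check MetricsReporter for the exact abstract methods:

```java
import com.codahale.metrics.MetricRegistry;
import org.apache.hudi.metrics.custom.CustomizableMetricsReporter;

import java.util.Properties;

// Hypothetical example class; not part of Hudi.
public class LoggingMetricsReporter extends CustomizableMetricsReporter {
  public LoggingMetricsReporter(Properties props, MetricRegistry registry) {
    // The (Properties, MetricRegistry) constructor shape is required, matching
    // what MetricsReporterFactory reflects on.
    super(props, registry);
  }

  @Override
  public void start() { /* open sinks, schedule periodic reporting, etc. */ }

  @Override
  public void report() {
    getRegistry().getGauges().forEach((name, gauge) ->
        System.out.println(name + " = " + gauge.getValue()));
  }

  @Override
  public void stop() { /* flush and close */ }
}
```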
*/ -public abstract class AbstractUserDefinedMetricsReporter extends MetricsReporter { - private Properties props; - private MetricRegistry registry; +@Deprecated +public abstract class AbstractUserDefinedMetricsReporter extends CustomizableMetricsReporter { public AbstractUserDefinedMetricsReporter(Properties props, MetricRegistry registry) { - this.props = props; - this.registry = registry; - } - - public Properties getProps() { - return props; - } - - public MetricRegistry getRegistry() { - return registry; + super(props, registry); } -} \ No newline at end of file +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java index 40e3a316db107..bb4ae962038fe 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java @@ -18,13 +18,13 @@ package org.apache.hudi.table; -import org.apache.avro.specific.SpecificRecordBase; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.model.HoodieCleanMetadata; import org.apache.hudi.avro.model.HoodieCleanerPlan; import org.apache.hudi.avro.model.HoodieClusteringPlan; import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.avro.model.HoodieRestoreMetadata; +import org.apache.hudi.avro.model.HoodieRestorePlan; import org.apache.hudi.avro.model.HoodieRollbackMetadata; import org.apache.hudi.avro.model.HoodieRollbackPlan; import org.apache.hudi.avro.model.HoodieSavepointMetadata; @@ -44,7 +44,6 @@ import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.TableSchemaResolver; -import org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; @@ -55,6 +54,7 @@ import org.apache.hudi.common.table.view.TableFileSystemView; import org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView; import org.apache.hudi.common.table.view.TableFileSystemView.SliceView; +import org.apache.hudi.common.util.Functions; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; @@ -69,17 +69,19 @@ import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata; import org.apache.hudi.table.marker.WriteMarkers; import org.apache.hudi.table.marker.WriteMarkersFactory; +import org.apache.hudi.table.storage.HoodieLayoutFactory; +import org.apache.hudi.table.storage.HoodieStorageLayout; import org.apache.avro.Schema; +import org.apache.avro.specific.SpecificRecordBase; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hudi.table.storage.HoodieLayoutFactory; -import org.apache.hudi.table.storage.HoodieStorageLayout; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import javax.annotation.Nonnull; + import java.io.IOException; import java.io.Serializable; import java.util.ArrayList; @@ -104,7 +106,7 @@ public abstract class HoodieTable implem protected final HoodieWriteConfig config; protected final HoodieTableMetaClient metaClient; - protected final HoodieIndex index; + protected final 
HoodieIndex index; private SerializableConfiguration hadoopConfiguration; protected final TaskContextSupplier taskContextSupplier; private final HoodieTableMetadata metadata; @@ -130,7 +132,7 @@ protected HoodieTable(HoodieWriteConfig config, HoodieEngineContext context, Hoo this.taskContextSupplier = context.getTaskContextSupplier(); } - protected abstract HoodieIndex getIndex(HoodieWriteConfig config, HoodieEngineContext context); + protected abstract HoodieIndex getIndex(HoodieWriteConfig config, HoodieEngineContext context); protected HoodieStorageLayout getStorageLayout(HoodieWriteConfig config) { return HoodieLayoutFactory.createLayout(config); @@ -143,6 +145,10 @@ private synchronized FileSystemViewManager getViewManager() { return viewManager; } + public HoodieTableMetadata getMetadata() { + return metadata; + } + /** * Upsert a batch of new records into Hoodie table at the supplied instantTime. * @param context HoodieEngineContext @@ -348,6 +354,13 @@ public HoodieTimeline getRollbackTimeline() { return getActiveTimeline().getRollbackTimeline(); } + /** + * Get restore timeline. + */ + public HoodieTimeline getRestoreTimeline() { + return getActiveTimeline().getRestoreTimeline(); + } + /** * Get only the completed (no-inflights) savepoint timeline. */ @@ -369,7 +382,7 @@ public HoodieActiveTimeline getActiveTimeline() { /** * Return the index. */ - public HoodieIndex getIndex() { + public HoodieIndex getIndex() { return index; } @@ -499,6 +512,13 @@ public abstract HoodieRestoreMetadata restore(HoodieEngineContext context, String restoreInstantTime, String instantToRestore); + /** + * Schedules Restore for the table to the given instant. + */ + public abstract Option scheduleRestore(HoodieEngineContext context, + String restoreInstantTime, + String instantToRestore); + /** * Rollback failed compactions. Inflight rollbacks for compactions revert the .inflight file * to the .requested file. 
@@ -681,9 +701,9 @@ private void validateSchema() throws HoodieUpsertException, HoodieInsertExceptio Schema writerSchema; boolean isValid; try { - TableSchemaResolver schemaUtil = new TableSchemaResolver(getMetaClient()); + TableSchemaResolver schemaResolver = new TableSchemaResolver(getMetaClient()); writerSchema = HoodieAvroUtils.createHoodieWriteSchema(config.getSchema()); - tableSchema = HoodieAvroUtils.createHoodieWriteSchema(schemaUtil.getTableAvroSchemaWithoutMetadataFields()); + tableSchema = HoodieAvroUtils.createHoodieWriteSchema(schemaResolver.getTableAvroSchemaWithoutMetadataFields()); isValid = TableSchemaResolver.isSchemaCompatible(tableSchema, writerSchema); } catch (Exception e) { throw new HoodieException("Failed to read schema/check compatibility for base path " + metaClient.getBasePath(), e); @@ -719,19 +739,6 @@ public HoodieFileFormat getLogFileFormat() { return metaClient.getTableConfig().getLogFileFormat(); } - public HoodieLogBlockType getLogDataBlockFormat() { - switch (getBaseFileFormat()) { - case PARQUET: - case ORC: - return HoodieLogBlockType.AVRO_DATA_BLOCK; - case HFILE: - return HoodieLogBlockType.HFILE_DATA_BLOCK; - default: - throw new HoodieException("Base file format " + getBaseFileFormat() - + " does not have associated log block format"); - } - } - public String getBaseFileExtension() { return getBaseFileFormat().getFileExtension(); } @@ -776,11 +783,18 @@ public final Option getMetadataWriter(String triggeri * @param triggeringInstantTimestamp - The instant that is triggering this metadata write * @return instance of {@link HoodieTableMetadataWriter} */ - public Option getMetadataWriter(String triggeringInstantTimestamp, - Option actionMetadata) { + public Option getMetadataWriter(String triggeringInstantTimestamp, + Option actionMetadata) { // Each engine is expected to override this and // provide the actual metadata writer, if enabled. return Option.empty(); } + public HoodieTableMetadata getMetadataTable() { + return this.metadata; + } + + public Runnable getPreExecuteRunnable() { + return Functions.noop(); + } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/WorkloadProfile.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/WorkloadProfile.java index 7700e95d1d707..8e6160b095483 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/WorkloadProfile.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/WorkloadProfile.java @@ -33,9 +33,14 @@ public class WorkloadProfile implements Serializable { /** - * Computed workload profile. + * Computed workload stats. */ - protected final HashMap partitionPathStatMap; + protected final HashMap inputPartitionPathStatMap; + + /** + * Execution/Output workload stats + */ + protected final HashMap outputPartitionPathStatMap; /** * Global workloadStat. 
@@ -47,13 +52,21 @@ public class WorkloadProfile implements Serializable { */ private WriteOperationType operationType; + private final boolean hasOutputWorkLoadStats; + public WorkloadProfile(Pair, WorkloadStat> profile) { - this.partitionPathStatMap = profile.getLeft(); + this(profile, false); + } + + public WorkloadProfile(Pair, WorkloadStat> profile, boolean hasOutputWorkLoadStats) { + this.inputPartitionPathStatMap = profile.getLeft(); this.globalStat = profile.getRight(); + this.outputPartitionPathStatMap = new HashMap<>(); + this.hasOutputWorkLoadStats = hasOutputWorkLoadStats; } - public WorkloadProfile(Pair, WorkloadStat> profile, WriteOperationType operationType) { - this(profile); + public WorkloadProfile(Pair, WorkloadStat> profile, WriteOperationType operationType, boolean hasOutputWorkLoadStats) { + this(profile, hasOutputWorkLoadStats); this.operationType = operationType; } @@ -62,15 +75,37 @@ public WorkloadStat getGlobalStat() { } public Set getPartitionPaths() { - return partitionPathStatMap.keySet(); + return inputPartitionPathStatMap.keySet(); + } + + public Set getOutputPartitionPaths() { + return hasOutputWorkLoadStats ? outputPartitionPathStatMap.keySet() : inputPartitionPathStatMap.keySet(); } - public HashMap getPartitionPathStatMap() { - return partitionPathStatMap; + public HashMap getInputPartitionPathStatMap() { + return inputPartitionPathStatMap; + } + + public HashMap getOutputPartitionPathStatMap() { + return outputPartitionPathStatMap; + } + + public boolean hasOutputWorkLoadStats() { + return hasOutputWorkLoadStats; + } + + public void updateOutputPartitionPathStatMap(String partitionPath, WorkloadStat workloadStat) { + if (hasOutputWorkLoadStats) { + outputPartitionPathStatMap.put(partitionPath, workloadStat); + } } public WorkloadStat getWorkloadStat(String partitionPath) { - return partitionPathStatMap.get(partitionPath); + return inputPartitionPathStatMap.get(partitionPath); + } + + public WorkloadStat getOutputWorkloadStat(String partitionPath) { + return hasOutputWorkLoadStats ? 
outputPartitionPathStatMap.get(partitionPath) : inputPartitionPathStatMap.get(partitionPath); } public WriteOperationType getOperationType() { @@ -81,7 +116,8 @@ public WriteOperationType getOperationType() { public String toString() { final StringBuilder sb = new StringBuilder("WorkloadProfile {"); sb.append("globalStat=").append(globalStat).append(", "); - sb.append("partitionStat=").append(partitionPathStatMap).append(", "); + sb.append("InputPartitionStat=").append(inputPartitionPathStatMap).append(", "); + sb.append("OutputPartitionStat=").append(outputPartitionPathStatMap).append(", "); sb.append("operationType=").append(operationType); sb.append('}'); return sb.toString(); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/WorkloadStat.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/WorkloadStat.java index c3371bab092db..327a5a3ae7980 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/WorkloadStat.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/WorkloadStat.java @@ -33,9 +33,12 @@ public class WorkloadStat implements Serializable { private long numUpdates = 0L; + private HashMap> insertLocationToCount; + private HashMap> updateLocationToCount; public WorkloadStat() { + insertLocationToCount = new HashMap<>(); updateLocationToCount = new HashMap<>(); } @@ -43,6 +46,17 @@ public long addInserts(long numInserts) { return this.numInserts += numInserts; } + public long addInserts(HoodieRecordLocation location, long numInserts) { + long accNumInserts = 0; + if (insertLocationToCount.containsKey(location.getFileId())) { + accNumInserts = insertLocationToCount.get(location.getFileId()).getRight(); + } + insertLocationToCount.put( + location.getFileId(), + Pair.of(location.getInstantTime(), numInserts + accNumInserts)); + return this.numInserts += numInserts; + } + public long addUpdates(HoodieRecordLocation location, long numUpdates) { long accNumUpdates = 0; if (updateLocationToCount.containsKey(location.getFileId())) { @@ -66,6 +80,10 @@ public HashMap> getUpdateLocationToCount() { return updateLocationToCount; } + public HashMap> getInsertLocationToCount() { + return insertLocationToCount; + } + @Override public String toString() { final StringBuilder sb = new StringBuilder("WorkloadStat {"); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/BaseActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/BaseActionExecutor.java index 221f970cb5132..f893b4ccd5c4e 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/BaseActionExecutor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/BaseActionExecutor.java @@ -65,7 +65,7 @@ protected final void writeTableMetadata(HoodieCommitMetadata metadata, String ac * Writes clean metadata to table metadata. * @param metadata clean metadata of interest. 
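+ * @param instantTime instant time of the clean action being written.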
*/ - protected final void writeTableMetadata(HoodieCleanMetadata metadata) { + protected final void writeTableMetadata(HoodieCleanMetadata metadata, String instantTime) { table.getMetadataWriter(instantTime).ifPresent(w -> w.update(metadata, instantTime)); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapRecordConsumer.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapRecordConsumer.java index 4d3cd479de09d..8966a5d51c7cb 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapRecordConsumer.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapRecordConsumer.java @@ -19,6 +19,7 @@ package org.apache.hudi.table.action.bootstrap; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.util.queue.BoundedInMemoryQueueConsumer; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.io.HoodieBootstrapHandle; @@ -39,7 +40,8 @@ public BootstrapRecordConsumer(HoodieBootstrapHandle bootstrapHandle) { @Override protected void consumeOneRecord(HoodieRecord record) { try { - bootstrapHandle.write(record, record.getData().getInsertValue(bootstrapHandle.getWriterSchemaWithMetaFields())); + bootstrapHandle.write(record, ((HoodieRecordPayload) record.getData()) + .getInsertValue(bootstrapHandle.getWriterSchemaWithMetaFields())); } catch (IOException e) { throw new HoodieIOException(e.getMessage(), e); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanActionExecutor.java index 9813b2b659677..4ae8009c9a88e 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanActionExecutor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanActionExecutor.java @@ -208,7 +208,7 @@ private HoodieCleanMetadata runClean(HoodieTable table, HoodieInstan if (!skipLocking) { this.txnManager.beginTransaction(Option.empty(), Option.empty()); } - writeTableMetadata(metadata); + writeTableMetadata(metadata, inflightInstant.getTimestamp()); table.getActiveTimeline().transitionCleanInflightToComplete(inflightInstant, TimelineMetadataUtils.serializeCleanMetadata(metadata)); LOG.info("Marked clean started on " + inflightInstant.getTimestamp() + " as complete"); @@ -240,9 +240,13 @@ public HoodieCleanMetadata execute() { LOG.warn("Failed to perform previous clean operation, instant: " + hoodieInstant, e); } } + table.getMetaClient().reloadActiveTimeline(); + if (config.isMetadataTableEnabled()) { + table.getHoodieView().sync(); + } }); - table.getMetaClient().reloadActiveTimeline(); } + // return the last clean metadata for now // TODO (NA) : Clean only the earliest pending clean just like how we do for other table services // This requires the CleanActionExecutor to be refactored as BaseCommitActionExecutor diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanner.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanner.java index 27937af880c40..7e56d3456a0a4 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanner.java +++ 
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanner.java @@ -33,6 +33,7 @@ import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; @@ -50,8 +51,12 @@ import java.io.IOException; import java.io.Serializable; +import java.time.Instant; +import java.time.ZoneId; +import java.time.ZonedDateTime; import java.util.ArrayList; import java.util.Collections; +import java.util.Date; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -123,6 +128,7 @@ public Stream getSavepointedDataFiles(String savepointTime) { public List getPartitionPathsToClean(Option earliestRetainedInstant) throws IOException { switch (config.getCleanerPolicy()) { case KEEP_LATEST_COMMITS: + case KEEP_LATEST_BY_HOURS: return getPartitionPathsForCleanByCommits(earliestRetainedInstant); case KEEP_LATEST_FILE_VERSIONS: return getPartitionPathsForFullCleaning(); @@ -251,6 +257,10 @@ private List getFilesToCleanKeepingLatestVersions(String partitio return deletePaths; } + private List getFilesToCleanKeepingLatestCommits(String partitionPath) { + return getFilesToCleanKeepingLatestCommits(partitionPath, config.getCleanerCommitsRetained(), HoodieCleaningPolicy.KEEP_LATEST_COMMITS); + } + /** * Selects the versions for file for cleaning, such that it *
@@ -265,8 +275,7 @@ private List getFilesToCleanKeepingLatestVersions(String partitio * - leaves the latest version of each file untouched, and * - retains every file version touched by the last config.getCleanerCommitsRetained() commits, plus one version before that window, * since max(query execution time) is assumed to be bounded by that retention window. *
* This policy is the default. */ - private List getFilesToCleanKeepingLatestCommits(String partitionPath) { - int commitsRetained = config.getCleanerCommitsRetained(); + private List getFilesToCleanKeepingLatestCommits(String partitionPath, int commitsRetained, HoodieCleaningPolicy policy) { LOG.info("Cleaning " + partitionPath + ", retaining latest " + commitsRetained + " commits. "); List deletePaths = new ArrayList<>(); @@ -303,14 +312,24 @@ private List getFilesToCleanKeepingLatestCommits(String partition // do not clean up a savepoint data file continue; } - // Dont delete the latest commit and also the last commit before the earliest commit we - // are retaining - // The window of commit retain == max query run time. So a query could be running which - // still - // uses this file. - if (fileCommitTime.equals(lastVersion) || (fileCommitTime.equals(lastVersionBeforeEarliestCommitToRetain))) { - // move on to the next file - continue; + + if (policy == HoodieCleaningPolicy.KEEP_LATEST_COMMITS) { + // Don't delete the latest commit, nor the last commit before the earliest commit we + // are retaining. The window of commits retained == max query run time, so a query + // could still be running that uses this file. + if (fileCommitTime.equals(lastVersion) || (fileCommitTime.equals(lastVersionBeforeEarliestCommitToRetain))) { + // move on to the next file + continue; + } + } else if (policy == HoodieCleaningPolicy.KEEP_LATEST_BY_HOURS) { + // This block corresponds to the KEEP_LATEST_BY_HOURS policy: never delete the latest commit. + if (fileCommitTime.equals(lastVersion)) { + // move on to the next file + continue; + } } // Always keep the last commit @@ -334,6 +353,18 @@ private List getFilesToCleanKeepingLatestCommits(String partition } return deletePaths; } + + /** + * This method finds the files to be cleaned based on their age in hours. If {@code config.getCleanerHoursRetained()} is set to 5, + * all file versions with a commit time older than 5 hours will be removed, while the latest version of each file group is always retained. + * This policy gives users much more flexibility in retaining data for incremental queries than the + * KEEP_LATEST_COMMITS cleaning policy. The default number of hours is 5.
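+ * For example, with 5 hours retained, a file version committed 6 hours ago becomes eligible for cleaning unless it is the latest version of its file group.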
+ @param partitionPath partition path to check + @return list of files to clean + */ + private List getFilesToCleanKeepingLatestHours(String partitionPath) { + return getFilesToCleanKeepingLatestCommits(partitionPath, 0, HoodieCleaningPolicy.KEEP_LATEST_BY_HOURS); + } private List getReplacedFilesEligibleToClean(List savepointedFiles, String partitionPath, Option earliestCommitToRetain) { final Stream replacedGroups; @@ -392,6 +423,8 @@ public List getDeletePaths(String partitionPath) { deletePaths = getFilesToCleanKeepingLatestCommits(partitionPath); } else if (policy == HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS) { deletePaths = getFilesToCleanKeepingLatestVersions(partitionPath); + } else if (policy == HoodieCleaningPolicy.KEEP_LATEST_BY_HOURS) { + deletePaths = getFilesToCleanKeepingLatestHours(partitionPath); } else { throw new IllegalArgumentException("Unknown cleaning policy : " + policy.name()); } @@ -406,9 +439,16 @@ public List getDeletePaths(String partitionPath) { public Option getEarliestCommitToRetain() { Option earliestCommitToRetain = Option.empty(); int commitsRetained = config.getCleanerCommitsRetained(); + int hoursRetained = config.getCleanerHoursRetained(); if (config.getCleanerPolicy() == HoodieCleaningPolicy.KEEP_LATEST_COMMITS && commitTimeline.countInstants() > commitsRetained) { - earliestCommitToRetain = commitTimeline.nthInstant(commitTimeline.countInstants() - commitsRetained); + earliestCommitToRetain = commitTimeline.nthInstant(commitTimeline.countInstants() - commitsRetained); // e.g. with 15 instants in total and 10 commits to retain, this picks the 6th instant in the list + } else if (config.getCleanerPolicy() == HoodieCleaningPolicy.KEEP_LATEST_BY_HOURS) { + Instant instant = Instant.now(); + ZonedDateTime currentDateTime = ZonedDateTime.ofInstant(instant, ZoneId.systemDefault()); + String earliestTimeToRetain = HoodieActiveTimeline.formatDate(Date.from(currentDateTime.minusHours(hoursRetained).toInstant())); + earliestCommitToRetain = Option.fromJavaOptional(commitTimeline.getInstants().filter(i -> HoodieTimeline.compareTimestamps(i.getTimestamp(), + HoodieTimeline.GREATER_THAN_OR_EQUALS, earliestTimeToRetain)).findFirst()); } return earliestCommitToRetain; } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/BaseClusteringPlanActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/ClusteringPlanActionExecutor.java similarity index 93% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/BaseClusteringPlanActionExecutor.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/ClusteringPlanActionExecutor.java index a1820ed93b7ca..15ead5efb0080 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/BaseClusteringPlanActionExecutor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/ClusteringPlanActionExecutor.java @@ -41,13 +41,13 @@ import java.util.Collections; import java.util.Map; -public abstract class BaseClusteringPlanActionExecutor extends BaseActionExecutor> { +public class ClusteringPlanActionExecutor extends BaseActionExecutor> { - private static final Logger LOG = LogManager.getLogger(BaseClusteringPlanActionExecutor.class); + private static final Logger LOG = LogManager.getLogger(ClusteringPlanActionExecutor.class); private final Option> extraMetadata; - public 
BaseClusteringPlanActionExecutor(HoodieEngineContext context, + public ClusteringPlanActionExecutor(HoodieEngineContext context, HoodieWriteConfig config, HoodieTable table, String instantTime, diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractBulkInsertHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseBulkInsertHelper.java similarity index 91% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractBulkInsertHelper.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseBulkInsertHelper.java index 6e1ddeb72e0a6..dffd926aee3d5 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractBulkInsertHelper.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseBulkInsertHelper.java @@ -26,7 +26,7 @@ import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; -public abstract class AbstractBulkInsertHelper { +public abstract class BaseBulkInsertHelper { /** * Mark instant as inflight, write input records, update index and return result. @@ -34,7 +34,7 @@ public abstract class AbstractBulkInsertHelper bulkInsert(I inputRecords, String instantTime, HoodieTable table, HoodieWriteConfig config, BaseCommitActionExecutor executor, boolean performDedupe, - Option> userDefinedBulkInsertPartitioner); + Option> userDefinedBulkInsertPartitioner); /** * Only write input records. Does not change timeline/index. Return information about new files created. @@ -42,7 +42,7 @@ public abstract HoodieWriteMetadata bulkInsert(I inputRecords, String instant public abstract O bulkInsert(I inputRecords, String instantTime, HoodieTable table, HoodieWriteConfig config, boolean performDedupe, - Option> userDefinedBulkInsertPartitioner, + Option> userDefinedBulkInsertPartitioner, boolean addMetadataFields, int parallelism, WriteHandleFactory writeHandleFactory); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java index 7449f3f8045a3..b8d5948c1f453 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseCommitActionExecutor.java @@ -18,12 +18,18 @@ package org.apache.hudi.table.action.commit; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.avro.model.HoodieClusteringGroup; +import org.apache.hudi.avro.model.HoodieClusteringPlan; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.transaction.TransactionManager; import org.apache.hudi.client.utils.TransactionUtils; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.HoodieWriteStat; @@ -32,10 +38,14 @@ import org.apache.hudi.common.table.timeline.HoodieInstant; import 
org.apache.hudi.common.table.timeline.HoodieInstant.State; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.ClusteringUtils; +import org.apache.hudi.common.util.CommitUtils; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieClusteringException; import org.apache.hudi.exception.HoodieCommitException; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.table.HoodieTable; @@ -43,7 +53,9 @@ import org.apache.hudi.table.WorkloadStat; import org.apache.hudi.table.action.BaseActionExecutor; import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.action.cluster.strategy.ClusteringExecutionStrategy; +import org.apache.avro.Schema; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -54,6 +66,9 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; public abstract class BaseCommitActionExecutor extends BaseActionExecutor { @@ -73,7 +88,7 @@ public BaseCommitActionExecutor(HoodieEngineContext context, HoodieWriteConfig c this.operationType = operationType; this.extraMetadata = extraMetadata; this.taskContextSupplier = context.getTaskContextSupplier(); - // TODO : Remove this once we refactor and move out autoCommit method from here, since the TxnManager is held in {@link AbstractHoodieWriteClient}. + // TODO : Remove this once we refactor and move out autoCommit method from here, since the TxnManager is held in {@link BaseHoodieWriteClient}. this.txnManager = new TransactionManager(config, table.getMetaClient().getFs()); this.lastCompletedTxn = TransactionUtils.getLastCompletedTxnInstantAndMetadata(table.getMetaClient()); if (table.getStorageLayout().doesNotSupport(operationType)) { @@ -94,22 +109,32 @@ void saveWorkloadProfileMetadataToInflight(WorkloadProfile profile, String insta throws HoodieCommitException { try { HoodieCommitMetadata metadata = new HoodieCommitMetadata(); - profile.getPartitionPaths().forEach(path -> { - WorkloadStat partitionStat = profile.getWorkloadStat(path); + profile.getOutputPartitionPaths().forEach(path -> { + WorkloadStat partitionStat = profile.getOutputWorkloadStat(path); HoodieWriteStat insertStat = new HoodieWriteStat(); insertStat.setNumInserts(partitionStat.getNumInserts()); insertStat.setFileId(""); insertStat.setPrevCommit(HoodieWriteStat.NULL_COMMIT); metadata.addWriteStat(path, insertStat); - - partitionStat.getUpdateLocationToCount().forEach((key, value) -> { - HoodieWriteStat writeStat = new HoodieWriteStat(); - writeStat.setFileId(key); - // TODO : Write baseCommitTime is possible here ? 
- writeStat.setPrevCommit(value.getKey()); - writeStat.setNumUpdateWrites(value.getValue()); - metadata.addWriteStat(path, writeStat); - }); + Map> updateLocationMap = partitionStat.getUpdateLocationToCount(); + Map> insertLocationMap = partitionStat.getInsertLocationToCount(); + Stream.concat(updateLocationMap.keySet().stream(), insertLocationMap.keySet().stream()) + .distinct() + .forEach(fileId -> { + HoodieWriteStat writeStat = new HoodieWriteStat(); + writeStat.setFileId(fileId); + Pair updateLocation = updateLocationMap.get(fileId); + Pair insertLocation = insertLocationMap.get(fileId); + // TODO: is writing the baseCommitTime possible here? + writeStat.setPrevCommit(updateLocation != null ? updateLocation.getKey() : insertLocation.getKey()); + if (updateLocation != null) { + writeStat.setNumUpdateWrites(updateLocation.getValue()); + } + if (insertLocation != null) { + writeStat.setNumInserts(insertLocation.getValue()); + } + metadata.addWriteStat(path, writeStat); + }); }); metadata.setOperationType(operationType); @@ -152,18 +177,22 @@ protected void commitOnAutoCommit(HoodieWriteMetadata result) { protected void autoCommit(Option> extraMetadata, HoodieWriteMetadata result) { final Option inflightInstant = Option.of(new HoodieInstant(State.INFLIGHT, - HoodieTimeline.COMMIT_ACTION, instantTime)); + getCommitActionType(), instantTime)); this.txnManager.beginTransaction(inflightInstant, lastCompletedTxn.isPresent() ? Option.of(lastCompletedTxn.get().getLeft()) : Option.empty()); try { + setCommitMetadata(result); + // Reload the active timeline so as to pick up all updates made after the current transaction started; hence the last arg is set to true. TransactionUtils.resolveWriteConflictIfAny(table, this.txnManager.getCurrentTransactionOwner(), - result.getCommitMetadata(), config, this.txnManager.getLastCompletedTransactionOwner()); + result.getCommitMetadata(), config, this.txnManager.getLastCompletedTransactionOwner(), true); commit(extraMetadata, result); } finally { this.txnManager.endTransaction(inflightInstant); } } + protected abstract void setCommitMetadata(HoodieWriteMetadata result); + protected abstract void commit(Option> extraMetadata, HoodieWriteMetadata result); /** @@ -197,4 +226,65 @@ protected abstract Iterator> handleInsert(String idPfx, protected abstract Iterator> handleUpdate(String partitionPath, String fileId, Iterator> recordItr) throws IOException; + + protected HoodieWriteMetadata> executeClustering(HoodieClusteringPlan clusteringPlan) { + HoodieInstant instant = HoodieTimeline.getReplaceCommitRequestedInstant(instantTime); + // Mark instant as clustering inflight + table.getActiveTimeline().transitionReplaceRequestedToInflight(instant, Option.empty()); + table.getMetaClient().reloadActiveTimeline(); + + final Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema())); + HoodieWriteMetadata> writeMetadata = ( + (ClusteringExecutionStrategy>, HoodieData, HoodieData>) + ReflectionUtils.loadClass(config.getClusteringExecutionStrategyClass(), + new Class[] {HoodieTable.class, HoodieEngineContext.class, HoodieWriteConfig.class}, table, context, config)) + .performClustering(clusteringPlan, schema, instantTime); + HoodieData writeStatusList = writeMetadata.getWriteStatuses(); + HoodieData statuses = updateIndex(writeStatusList, writeMetadata); + writeMetadata.setWriteStats(statuses.map(WriteStatus::getStat).collectAsList()); + writeMetadata.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(clusteringPlan, writeMetadata)); + 
validateWriteResult(clusteringPlan, writeMetadata); + commitOnAutoCommit(writeMetadata); + if (!writeMetadata.getCommitMetadata().isPresent()) { + HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(writeMetadata.getWriteStats().get(), writeMetadata.getPartitionToReplaceFileIds(), + extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType()); + writeMetadata.setCommitMetadata(Option.of(commitMetadata)); + } + return writeMetadata; + } + + private HoodieData updateIndex(HoodieData writeStatuses, HoodieWriteMetadata> result) { + Instant indexStartTime = Instant.now(); + // Update the index back + HoodieData statuses = table.getIndex().updateLocation(writeStatuses, context, table); + result.setIndexUpdateDuration(Duration.between(indexStartTime, Instant.now())); + result.setWriteStatuses(statuses); + return statuses; + } + + private Map> getPartitionToReplacedFileIds(HoodieClusteringPlan clusteringPlan, HoodieWriteMetadata> writeMetadata) { + Set newFilesWritten = writeMetadata.getWriteStats().get().stream() + .map(s -> new HoodieFileGroupId(s.getPartitionPath(), s.getFileId())).collect(Collectors.toSet()); + + return ClusteringUtils.getFileGroupsFromClusteringPlan(clusteringPlan) + .filter(fg -> "org.apache.hudi.client.clustering.run.strategy.SparkSingleFileSortExecutionStrategy" + .equals(config.getClusteringExecutionStrategyClass()) + || !newFilesWritten.contains(fg)) + .collect(Collectors.groupingBy(HoodieFileGroupId::getPartitionPath, Collectors.mapping(HoodieFileGroupId::getFileId, Collectors.toList()))); + } + + /** + * Validate actions taken by clustering. In the first implementation, we validate at least one new file is written. + * But we can extend this to add more validation. E.g. number of records read = number of records written etc. + * We can also make these validations in BaseCommitActionExecutor to reuse pre-commit hooks for multiple actions. + */ + private void validateWriteResult(HoodieClusteringPlan clusteringPlan, HoodieWriteMetadata> writeMetadata) { + if (writeMetadata.getWriteStatuses().isEmpty()) { + throw new HoodieClusteringException("Clustering plan produced 0 WriteStatus for " + instantTime + + " #groups: " + clusteringPlan.getInputGroups().size() + " expected at least " + + clusteringPlan.getInputGroups().stream().mapToInt(HoodieClusteringGroup::getNumOutputFileGroups).sum() + + " write statuses"); + } + } + } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractDeleteHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseDeleteHelper.java similarity index 95% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractDeleteHelper.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseDeleteHelper.java index ac0f2596f490e..b119587f47535 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractDeleteHelper.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseDeleteHelper.java @@ -29,7 +29,7 @@ * * @param */ -public abstract class AbstractDeleteHelper { +public abstract class BaseDeleteHelper { /** * Deduplicate Hoodie records, using the given deduplication function. 
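Stepping back to getPartitionToReplacedFileIds in BaseCommitActionExecutor above: the Collectors pipeline there reduces the plan's file groups to a partition-path-to-file-ids map. A self-contained sketch of just that grouping, with made-up partition and file-id values standing in for HoodieFileGroupId (map iteration order may vary):

import java.util.AbstractMap;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class GroupReplacedFileIds {
  public static void main(String[] args) {
    // Stand-in for HoodieFileGroupId: (partitionPath, fileId) pairs.
    List<Map.Entry<String, String>> fileGroups = Arrays.asList(
        new AbstractMap.SimpleEntry<>("2022/01/26", "fg-1"),
        new AbstractMap.SimpleEntry<>("2022/01/26", "fg-2"),
        new AbstractMap.SimpleEntry<>("2022/01/27", "fg-3"));

    // Same shape as the executor's result: partition path -> replaced file ids.
    Map<String, List<String>> partitionToReplacedFileIds = fileGroups.stream()
        .collect(Collectors.groupingBy(Map.Entry::getKey,
            Collectors.mapping(Map.Entry::getValue, Collectors.toList())));

    System.out.println(partitionToReplacedFileIds);
    // {2022/01/26=[fg-1, fg-2], 2022/01/27=[fg-3]}
  }
}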
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractMergeHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseMergeHelper.java similarity index 98% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractMergeHelper.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseMergeHelper.java index 59a3323bcb73e..5ead348140aa3 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractMergeHelper.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseMergeHelper.java @@ -47,7 +47,7 @@ /** * Helper to read records from previous version of base file and run Merge. */ -public abstract class AbstractMergeHelper { +public abstract class BaseMergeHelper { /** * Read records from previous version of base file and merge. diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractWriteHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseWriteHelper.java similarity index 95% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractWriteHelper.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseWriteHelper.java index 3f241944c3af7..6d5372b47297d 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractWriteHelper.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/BaseWriteHelper.java @@ -30,7 +30,7 @@ import java.time.Duration; import java.time.Instant; -public abstract class AbstractWriteHelper { +public abstract class BaseWriteHelper { public HoodieWriteMetadata write(String instantTime, I inputRecords, @@ -86,5 +86,5 @@ public I deduplicateRecords( } public abstract I deduplicateRecords( - I records, HoodieIndex index, int parallelism); + I records, HoodieIndex index, int parallelism); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/HoodieCompactor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/HoodieCompactor.java index 73e1413d9dde0..e238d40683b64 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/HoodieCompactor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/compact/HoodieCompactor.java @@ -111,13 +111,13 @@ public HoodieData compact( table.getMetaClient().reloadActiveTimeline(); HoodieTableMetaClient metaClient = table.getMetaClient(); - TableSchemaResolver schemaUtil = new TableSchemaResolver(metaClient); + TableSchemaResolver schemaResolver = new TableSchemaResolver(metaClient); // Here we first use the table schema as the reader schema to read the // log files. That is because, in the case of MergeInto, config.getSchema may not // be the same as the table schema. try { - Schema readerSchema = schemaUtil.getTableAvroSchema(false); + Schema readerSchema = schemaResolver.getTableAvroSchema(false); config.setSchema(readerSchema.toString()); } catch (Exception e) { // If there is no commit in the table, just ignore the exception.
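Put differently, the hunk above makes the compactor trust the table schema rather than config.getSchema() when scanning log files. A condensed, method-shaped recap under that reading (the Hudi/Avro types are real; the wrapper class and method name are invented for illustration):

import org.apache.avro.Schema;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.TableSchemaResolver;
import org.apache.hudi.config.HoodieWriteConfig;

class ReaderSchemaRecap {
  // Prefer the latest table schema over config.getSchema() when scanning log files,
  // since a MergeInto write schema may differ from the table schema.
  static void preferTableSchemaAsReaderSchema(HoodieTableMetaClient metaClient, HoodieWriteConfig config) {
    TableSchemaResolver schemaResolver = new TableSchemaResolver(metaClient);
    try {
      Schema readerSchema = schemaResolver.getTableAvroSchema(false);
      config.setSchema(readerSchema.toString());
    } catch (Exception e) {
      // No commit in the table yet; keep whatever schema the config already carries.
    }
  }
}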
@@ -182,14 +182,30 @@ public List compact(HoodieCompactionHandler compactionHandler, .withOperationField(config.allowOperationMetadataField()) .withPartition(operation.getPartitionPath()) .build(); - if (!scanner.iterator().hasNext()) { - scanner.close(); - return new ArrayList<>(); - } Option oldDataFileOpt = operation.getBaseFile(metaClient.getBasePath(), operation.getPartitionPath()); + // Consider the following scenario: if all log blocks in this file slice were rolled back, the scanner comes back empty. + // But in that case we still need to hand over a base file; otherwise the base file would be lost in the following file slice. + if (!scanner.iterator().hasNext()) { + if (!oldDataFileOpt.isPresent()) { + scanner.close(); + return new ArrayList<>(); + } else { + // TODO: we may directly rename the original parquet file if there is no schema evolution/devolution + /* + TaskContextSupplier taskContextSupplier = hoodieCopyOnWriteTable.getTaskContextSupplier(); + String newFileName = FSUtils.makeDataFileName(instantTime, + FSUtils.makeWriteToken(taskContextSupplier.getPartitionIdSupplier().get(), taskContextSupplier.getStageIdSupplier().get(), taskContextSupplier.getAttemptIdSupplier().get()), + operation.getFileId(), hoodieCopyOnWriteTable.getBaseFileExtension()); + Path oldFilePath = new Path(oldDataFileOpt.get().getPath()); + Path newFilePath = new Path(oldFilePath.getParent(), newFileName); + FileUtil.copy(fs, oldFilePath, fs, newFilePath, false, fs.getConf()); + */ + } + } + // Compacting is very similar to applying updates to existing file Iterator> result; // If the dataFile is present, perform updates else perform inserts into a new base file. diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/restore/BaseRestoreActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/restore/BaseRestoreActionExecutor.java index 58247bb8ea0be..9025623e86916 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/restore/BaseRestoreActionExecutor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/restore/BaseRestoreActionExecutor.java @@ -18,7 +18,9 @@ package org.apache.hudi.table.action.restore; +import org.apache.hudi.avro.model.HoodieInstantInfo; import org.apache.hudi.avro.model.HoodieRestoreMetadata; +import org.apache.hudi.avro.model.HoodieRestorePlan; import org.apache.hudi.avro.model.HoodieRollbackMetadata; import org.apache.hudi.client.transaction.TransactionManager; import org.apache.hudi.common.engine.HoodieEngineContext; @@ -29,14 +31,18 @@ import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; import org.apache.hudi.common.util.HoodieTimer; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieRestoreException; import org.apache.hudi.exception.HoodieRollbackException; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.BaseActionExecutor; + import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import java.io.IOException; +import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -65,27 +71,51 @@ public HoodieRestoreMetadata execute() { HoodieTimer restoreTimer = new HoodieTimer(); restoreTimer.startTimer(); - // Get all the commits on the timeline after the provided commit time - List instantsToRollback = 
table.getActiveTimeline().getWriteTimeline() - .getReverseOrderedInstants() - .filter(instant -> HoodieActiveTimeline.GREATER_THAN.test(instant.getTimestamp(), restoreInstantTime)) - .collect(Collectors.toList()); + Option restoreInstant = table.getRestoreTimeline() + .filterInflightsAndRequested() + .filter(instant -> instant.getTimestamp().equals(instantTime)) + .firstInstant(); + if (!restoreInstant.isPresent()) { + throw new HoodieRollbackException("No pending restore instants found to execute restore"); + } + try { + List instantsToRollback = getInstantsToRollback(restoreInstant.get()); + ValidationUtils.checkArgument(restoreInstant.get().getState().equals(HoodieInstant.State.REQUESTED) + || restoreInstant.get().getState().equals(HoodieInstant.State.INFLIGHT)); + Map> instantToMetadata = new HashMap<>(); + if (restoreInstant.get().isRequested()) { + table.getActiveTimeline().transitionRestoreRequestedToInflight(restoreInstant.get()); + } - Map> instantToMetadata = new HashMap<>(); - table.getActiveTimeline().createNewInstant(new HoodieInstant(true, HoodieTimeline.RESTORE_ACTION, instantTime)); - instantsToRollback.forEach(instant -> { - instantToMetadata.put(instant.getTimestamp(), Collections.singletonList(rollbackInstant(instant))); - LOG.info("Deleted instant " + instant); - }); + instantsToRollback.forEach(instant -> { + instantToMetadata.put(instant.getTimestamp(), Collections.singletonList(rollbackInstant(instant))); + LOG.info("Deleted instant " + instant); + }); - try { return finishRestore(instantToMetadata, instantsToRollback, restoreTimer.endTimer() ); } catch (IOException io) { - throw new HoodieRollbackException("unable to rollback instants " + instantsToRollback, io); + throw new HoodieRestoreException("Unable to restore instant " + restoreInstant.get(), io); + } + } + + private List getInstantsToRollback(HoodieInstant restoreInstant) throws IOException { + List instantsToRollback = new ArrayList<>(); + HoodieRestorePlan restorePlan = RestoreUtils.getRestorePlan(table.getMetaClient(), restoreInstant); + for (HoodieInstantInfo instantInfo : restorePlan.getInstantsToRollback()) { + // If the restore crashed mid-way, chances are that some commits were already rolled back + // but some were not, so we can skip any commits that were fully rolled back in a previous attempt.
+ Option rollbackInstantOpt = table.getActiveTimeline().getWriteTimeline() + .filter(instant -> instant.getTimestamp().equals(instantInfo.getCommitTime()) && instant.getAction().equals(instantInfo.getAction())).firstInstant(); + if (rollbackInstantOpt.isPresent()) { + instantsToRollback.add(rollbackInstantOpt.get()); + } else { + LOG.warn("Ignoring already rolled-back instant " + instantInfo.toString()); + } } return instantsToRollback; } protected abstract HoodieRollbackMetadata rollbackInstant(HoodieInstant rollbackInstant); @@ -99,7 +129,7 @@ private HoodieRestoreMetadata finishRestore(Map instantsToRollback = table.getActiveTimeline().getRollbackTimeline() .getReverseOrderedInstants() @@ -115,6 +145,7 @@ private HoodieRestoreMetadata finishRestore(Map> maybeDeleteAndCollectStats(HoodieEngineCo rollbackStats.forEach(entry -> partitionToRollbackStats.add(Pair.of(entry.getPartitionPath(), entry))); return partitionToRollbackStats.stream(); } else if (!rollbackRequest.getLogBlocksToBeDeleted().isEmpty()) { - Map logFilesToBeDeleted = rollbackRequest.getLogBlocksToBeDeleted(); - String fileId = rollbackRequest.getFileId(); - String latestBaseInstant = rollbackRequest.getLatestBaseInstant(); - FileSystem fs = metaClient.getFs(); - // collect all log files that is supposed to be deleted with this rollback - // what happens if file was deleted when invoking fs.getFileStatus(?) below. - // I understand we don't delete log files. but just curious if we need to handle this case. - Map writtenLogFileSizeMap = new HashMap<>(); - for (Map.Entry entry : logFilesToBeDeleted.entrySet()) { - writtenLogFileSizeMap.put(fs.getFileStatus(new Path(entry.getKey())), entry.getValue()); - } HoodieLogFormat.Writer writer = null; try { + String fileId = rollbackRequest.getFileId(); + String latestBaseInstant = rollbackRequest.getLatestBaseInstant(); + writer = HoodieLogFormat.newWriterBuilder() .onParentPath(FSUtils.getPartitionPath(metaClient.getBasePath(), rollbackRequest.getPartitionPath())) .withFileId(fileId) @@ -156,7 +146,7 @@ List> maybeDeleteAndCollectStats(HoodieEngineCo writer.close(); } } catch (IOException io) { - throw new HoodieIOException("Error appending rollback block..", io); + throw new HoodieIOException("Error appending rollback block", io); } } @@ -167,15 +157,21 @@ List> maybeDeleteAndCollectStats(HoodieEngineCo metaClient.getFs().getFileStatus(Objects.requireNonNull(writer).getLogFile().getPath()), 1L ); - return Collections.singletonList(Pair.of(rollbackRequest.getPartitionPath(), - HoodieRollbackStat.newBuilder().withPartitionPath(rollbackRequest.getPartitionPath()) - .withRollbackBlockAppendResults(filesToNumBlocksRollback) - .withWrittenLogFileSizeMap(writtenLogFileSizeMap).build())).stream(); + + return Collections.singletonList( + Pair.of(rollbackRequest.getPartitionPath(), + HoodieRollbackStat.newBuilder() + .withPartitionPath(rollbackRequest.getPartitionPath()) + .withRollbackBlockAppendResults(filesToNumBlocksRollback) + .build())) + .stream(); } else { - return Collections - .singletonList(Pair.of(rollbackRequest.getPartitionPath(), - HoodieRollbackStat.newBuilder().withPartitionPath(rollbackRequest.getPartitionPath()) - .build())).stream(); + return Collections.singletonList( + Pair.of(rollbackRequest.getPartitionPath(), + HoodieRollbackStat.newBuilder() + .withPartitionPath(rollbackRequest.getPartitionPath()) + .build())) + .stream(); } }, numPartitions); } diff --git 
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackHelper.java index b47136fa02a58..628b2fc3720f8 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackHelper.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackHelper.java @@ -19,18 +19,17 @@ package org.apache.hudi.table.action.rollback; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; import org.apache.hudi.avro.model.HoodieRollbackRequest; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.HoodieFileFormat; -import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.config.HoodieWriteConfig; - -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.PathFilter; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -38,7 +37,6 @@ import java.io.Serializable; import java.util.Arrays; import java.util.Collections; -import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.stream.Collectors; @@ -104,22 +102,20 @@ private List getListingBasedRollbackRequests(HoodieEngine case APPEND_ROLLBACK_BLOCK: { String fileId = rollbackRequest.getFileId().get(); String latestBaseInstant = rollbackRequest.getLatestBaseInstant().get(); - // collect all log files that is supposed to be deleted with this rollback - Map writtenLogFileSizeMap = FSUtils.getAllLogFiles(metaClient.getFs(), - FSUtils.getPartitionPath(config.getBasePath(), rollbackRequest.getPartitionPath()), - fileId, HoodieFileFormat.HOODIE_LOG.getFileExtension(), latestBaseInstant) - .collect(Collectors.toMap(HoodieLogFile::getFileStatus, value -> value.getFileStatus().getLen())); - Map logFilesToBeDeleted = new HashMap<>(); - for (Map.Entry fileToBeDeleted : writtenLogFileSizeMap.entrySet()) { - logFilesToBeDeleted.put(fileToBeDeleted.getKey().getPath().toString(), fileToBeDeleted.getValue()); - } + HoodieWriteStat writeStat = rollbackRequest.getWriteStat().get(); + + Path fullLogFilePath = FSUtils.getPartitionPath(config.getBasePath(), writeStat.getPath()); + + Map logFilesWithBlocksToRollback = + Collections.singletonMap(fullLogFilePath.toString(), writeStat.getTotalWriteBytes()); + return new HoodieRollbackRequest(rollbackRequest.getPartitionPath(), fileId, latestBaseInstant, - Collections.EMPTY_LIST, logFilesToBeDeleted); + Collections.EMPTY_LIST, logFilesWithBlocksToRollback); } default: throw new IllegalStateException("Unknown Rollback action " + rollbackRequest); } - }, numPartitions).stream().collect(Collectors.toList()); + }, numPartitions); } private FileStatus[] getBaseFilesToBeDeleted(HoodieTableMetaClient metaClient, HoodieWriteConfig config, diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackRequest.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackRequest.java index 
fc369a46711cf..7411231bb7d79 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackRequest.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackRequest.java @@ -18,12 +18,15 @@ package org.apache.hudi.table.action.rollback; +import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.util.Option; +import java.io.Serializable; + /** * Request for performing one rollback action. */ -public class ListingBasedRollbackRequest { +public class ListingBasedRollbackRequest implements Serializable { /** * Rollback commands that trigger a specific handling for rollback. @@ -49,32 +52,42 @@ public enum Type { */ private final Option latestBaseInstant; + /** + * Stat of the write being rolled back; for APPEND_ROLLBACK_BLOCK requests it carries the path and size of the targeted log file. + */ + private final Option writeStat; + private final Type type; + public ListingBasedRollbackRequest(String partitionPath, Type type) { + this(partitionPath, Option.empty(), Option.empty(), Option.empty(), type); + } + public ListingBasedRollbackRequest(String partitionPath, Option fileId, Option latestBaseInstant, + Option writeStat, Type type) { this.partitionPath = partitionPath; this.fileId = fileId; this.latestBaseInstant = latestBaseInstant; + this.writeStat = writeStat; this.type = type; } public static ListingBasedRollbackRequest createRollbackRequestWithDeleteDataFilesOnlyAction(String partitionPath) { - return new ListingBasedRollbackRequest(partitionPath, Option.empty(), Option.empty(), - Type.DELETE_DATA_FILES_ONLY); + return new ListingBasedRollbackRequest(partitionPath, Type.DELETE_DATA_FILES_ONLY); } public static ListingBasedRollbackRequest createRollbackRequestWithDeleteDataAndLogFilesAction(String partitionPath) { - return new ListingBasedRollbackRequest(partitionPath, Option.empty(), Option.empty(), - Type.DELETE_DATA_AND_LOG_FILES); + return new ListingBasedRollbackRequest(partitionPath, Type.DELETE_DATA_AND_LOG_FILES); } - public static ListingBasedRollbackRequest createRollbackRequestWithAppendRollbackBlockAction(String partitionPath, String fileId, - String baseInstant) { - return new ListingBasedRollbackRequest(partitionPath, Option.of(fileId), Option.of(baseInstant), - Type.APPEND_ROLLBACK_BLOCK); + public static ListingBasedRollbackRequest createRollbackRequestWithAppendRollbackBlockAction(String partitionPath, + String fileId, + String baseInstant, + HoodieWriteStat writeStat) { + return new ListingBasedRollbackRequest(partitionPath, Option.of(fileId), Option.of(baseInstant), Option.of(writeStat), Type.APPEND_ROLLBACK_BLOCK); } public String getPartitionPath() { @@ -89,6 +102,10 @@ public Option getLatestBaseInstant() { return latestBaseInstant; } + public Option getWriteStat() { + return writeStat; + } + public Type getType() { return type; } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/MarkerBasedRollbackStrategy.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/MarkerBasedRollbackStrategy.java index 9d04e3036f204..e7a4170ec7871 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/MarkerBasedRollbackStrategy.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/MarkerBasedRollbackStrategy.java @@ -18,6 +18,7 @@ package org.apache.hudi.table.action.rollback; +import org.apache.hadoop.fs.Path; import org.apache.hudi.avro.model.HoodieRollbackRequest; import 
org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.fs.FSUtils; @@ -31,18 +32,13 @@ import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.marker.MarkerBasedRollbackUtils; import org.apache.hudi.table.marker.WriteMarkers; - -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import java.io.IOException; import java.util.Collections; -import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.stream.Collectors; import static org.apache.hudi.table.action.rollback.BaseRollbackHelper.EMPTY_STRING; @@ -90,42 +86,41 @@ public List getRollbackRequests(HoodieInstant instantToRo Collections.singletonList(fullDeletePath.toString()), Collections.emptyMap()); case APPEND: + // NOTE: This marker file-path does NOT correspond to a log-file, but rather is a phony + // path serving as a "container" for the following components: + // - Base file's file-id + // - Base file's commit instant + // - Partition path return getRollbackRequestForAppend(WriteMarkers.stripMarkerSuffix(markerFilePath)); default: throw new HoodieRollbackException("Unknown marker type, during rollback of " + instantToRollback); } - }, parallelism).stream().collect(Collectors.toList()); + }, parallelism); } catch (Exception e) { throw new HoodieRollbackException("Error rolling back using marker files written for " + instantToRollback, e); } } - protected HoodieRollbackRequest getRollbackRequestForAppend(String appendBaseFilePath) throws IOException { - Path baseFilePathForAppend = new Path(basePath, appendBaseFilePath); + protected HoodieRollbackRequest getRollbackRequestForAppend(String markerFilePath) throws IOException { + Path baseFilePathForAppend = new Path(basePath, markerFilePath); String fileId = FSUtils.getFileIdFromFilePath(baseFilePathForAppend); String baseCommitTime = FSUtils.getCommitTime(baseFilePathForAppend.getName()); - String partitionPath = FSUtils.getRelativePartitionPath(new Path(basePath), new Path(basePath, appendBaseFilePath).getParent()); - Map writtenLogFileSizeMap = getWrittenLogFileSizeMap(partitionPath, baseCommitTime, fileId); - Map writtenLogFileStrSizeMap = new HashMap<>(); - for (Map.Entry entry : writtenLogFileSizeMap.entrySet()) { - writtenLogFileStrSizeMap.put(entry.getKey().getPath().toString(), entry.getValue()); - } - return new HoodieRollbackRequest(partitionPath, fileId, baseCommitTime, Collections.emptyList(), writtenLogFileStrSizeMap); + String relativePartitionPath = FSUtils.getRelativePartitionPath(new Path(basePath), baseFilePathForAppend.getParent()); + Path partitionPath = FSUtils.getPartitionPath(config.getBasePath(), relativePartitionPath); + + // NOTE: Since we're rolling back an incomplete delta commit, it could only have appended its + // block to the latest log file + // TODO(HUDI-1517) use provided marker-file's path instead + HoodieLogFile latestLogFile = FSUtils.getLatestLogFile(table.getMetaClient().getFs(), partitionPath, fileId, + HoodieFileFormat.HOODIE_LOG.getFileExtension(), baseCommitTime).get(); + + // NOTE: Markers don't carry information about the cumulative size of the blocks that have been appended, + // therefore we simply stub this value.
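+ // The -1L below is a deliberate stub: the rollback request only needs the log file's path here, not an accurate size.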
+ Map logFilesWithBlocksToRollback = + Collections.singletonMap(latestLogFile.getFileStatus().getPath().toString(), -1L); + + return new HoodieRollbackRequest(relativePartitionPath, fileId, baseCommitTime, Collections.emptyList(), + logFilesWithBlocksToRollback); } - /** - * Returns written log file size map for the respective baseCommitTime to assist in metadata table syncing. - * - * @param partitionPathStr partition path of interest - * @param baseCommitTime base commit time of interest - * @param fileId fileId of interest - * @return Map - * @throws IOException - */ - private Map getWrittenLogFileSizeMap(String partitionPathStr, String baseCommitTime, String fileId) throws IOException { - // collect all log files that is supposed to be deleted with this rollback - return FSUtils.getAllLogFiles(table.getMetaClient().getFs(), - FSUtils.getPartitionPath(config.getBasePath(), partitionPathStr), fileId, HoodieFileFormat.HOODIE_LOG.getFileExtension(), baseCommitTime) - .collect(Collectors.toMap(HoodieLogFile::getFileStatus, value -> value.getFileStatus().getLen())); - } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/RestorePlanActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/RestorePlanActionExecutor.java new file mode 100644 index 0000000000000..e33dffcb7b953 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/RestorePlanActionExecutor.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table.action.rollback; + +import org.apache.hudi.avro.model.HoodieInstantInfo; +import org.apache.hudi.avro.model.HoodieRestorePlan; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.util.ClusteringUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.BaseActionExecutor; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * Plans the restore action and adds a restore.requested meta file to the timeline.
+ */ +public class RestorePlanActionExecutor extends BaseActionExecutor> { + + + private static final Logger LOG = LogManager.getLogger(RestorePlanActionExecutor.class); + + public static final Integer RESTORE_PLAN_VERSION_1 = 1; + public static final Integer LATEST_RESTORE_PLAN_VERSION = RESTORE_PLAN_VERSION_1; + private final String restoreInstantTime; + + public RestorePlanActionExecutor(HoodieEngineContext context, + HoodieWriteConfig config, + HoodieTable table, + String instantTime, + String restoreInstantTime) { + super(context, config, table, instantTime); + this.restoreInstantTime = restoreInstantTime; + } + + @Override + public Option execute() { + final HoodieInstant restoreInstant = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.RESTORE_ACTION, instantTime); + try { + // Roll back pending clustering instants first, before other instants (see HUDI-3362) + List pendingClusteringInstantsToRollback = table.getActiveTimeline().filterPendingReplaceTimeline() + // filter only clustering-related replacecommits (not insert_overwrite related commits) + .filter(instant -> ClusteringUtils.isPendingClusteringInstant(table.getMetaClient(), instant)) + .getReverseOrderedInstants() + .filter(instant -> HoodieActiveTimeline.GREATER_THAN.test(instant.getTimestamp(), restoreInstantTime)) + .collect(Collectors.toList()); + + // Get all the commits on the timeline after the provided commit time + List commitInstantsToRollback = table.getActiveTimeline().getWriteTimeline() + .getReverseOrderedInstants() + .filter(instant -> HoodieActiveTimeline.GREATER_THAN.test(instant.getTimestamp(), restoreInstantTime)) + .filter(instant -> !pendingClusteringInstantsToRollback.contains(instant)) + .collect(Collectors.toList()); + + // Combine both lists - first rollback pending clustering and then rollback all other commits + List instantsToRollback = Stream.concat(pendingClusteringInstantsToRollback.stream(), commitInstantsToRollback.stream()) + .map(entry -> new HoodieInstantInfo(entry.getTimestamp(), entry.getAction())) + .collect(Collectors.toList()); + + HoodieRestorePlan restorePlan = new HoodieRestorePlan(instantsToRollback, LATEST_RESTORE_PLAN_VERSION); + table.getActiveTimeline().saveToRestoreRequested(restoreInstant, TimelineMetadataUtils.serializeRestorePlan(restorePlan)); + table.getMetaClient().reloadActiveTimeline(); + LOG.info("Requesting restore with instant time " + restoreInstant); + return Option.of(restorePlan); + } catch (IOException e) { + LOG.error("Got exception when saving restore requested file", e); + throw new HoodieIOException(e.getMessage(), e); + } + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/RollbackUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/RollbackUtils.java index a4b59a88b92c4..2bc9b59b0d1f1 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/RollbackUtils.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/RollbackUtils.java @@ -18,6 +18,7 @@ package org.apache.hudi.table.action.rollback; +import org.apache.hadoop.fs.FileStatus; import org.apache.hudi.avro.model.HoodieRollbackPlan; import org.apache.hudi.common.HoodieRollbackStat; import org.apache.hudi.common.engine.HoodieEngineContext; @@ -33,12 +34,8 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline; import 
org.apache.hudi.common.table.timeline.TimelineMetadataUtils; import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.table.HoodieTable; - -import org.apache.hadoop.fs.FileStatus; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -48,8 +45,11 @@ import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.function.Function; import java.util.stream.Collectors; +import static org.apache.hudi.common.util.ValidationUtils.checkArgument; + public class RollbackUtils { private static final Logger LOG = LogManager.getLogger(RollbackUtils.class); @@ -88,7 +88,7 @@ static Map generateHeader(String inst * @return Merged HoodieRollbackStat */ static HoodieRollbackStat mergeRollbackStat(HoodieRollbackStat stat1, HoodieRollbackStat stat2) { - ValidationUtils.checkArgument(stat1.getPartitionPath().equals(stat2.getPartitionPath())); + checkArgument(stat1.getPartitionPath().equals(stat2.getPartitionPath())); final List successDeleteFiles = new ArrayList<>(); final List failedDeleteFiles = new ArrayList<>(); final Map commandBlocksCount = new HashMap<>(); @@ -99,9 +99,7 @@ static HoodieRollbackStat mergeRollbackStat(HoodieRollbackStat stat1, HoodieRoll Option.ofNullable(stat2.getFailedDeleteFiles()).ifPresent(failedDeleteFiles::addAll); Option.ofNullable(stat1.getCommandBlocksCount()).ifPresent(commandBlocksCount::putAll); Option.ofNullable(stat2.getCommandBlocksCount()).ifPresent(commandBlocksCount::putAll); - Option.ofNullable(stat1.getWrittenLogFileSizeMap()).ifPresent(writtenLogFileSizeMap::putAll); - Option.ofNullable(stat2.getWrittenLogFileSizeMap()).ifPresent(writtenLogFileSizeMap::putAll); - return new HoodieRollbackStat(stat1.getPartitionPath(), successDeleteFiles, failedDeleteFiles, commandBlocksCount, writtenLogFileSizeMap); + return new HoodieRollbackStat(stat1.getPartitionPath(), successDeleteFiles, failedDeleteFiles, commandBlocksCount); } /** @@ -191,28 +189,22 @@ public static List generateRollbackRequestsUsingFil // (B.3) Rollback triggered for first commit - Same as (B.1) // (B.4) Rollback triggered for recurring commits - Same as (B.2) plus we need to delete the log files // as well if the base file gets deleted.
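To make the (A.x)/(B.x) cases concrete, here is a minimal sketch of the per-partition decision the refactored hunk below implements; the factory and helper calls are the ones visible in this diff, while the surrounding variables (partitionPath, commitMetadata, instantToRollback, table) are assumed to be in scope:

    List<ListingBasedRollbackRequest> requests = new ArrayList<>();
    // Inserts: fileIds of first writes are unknown (they may be log or base files),
    // so delete every file written by the failed commit, same as COW.
    requests.add(ListingBasedRollbackRequest.createRollbackRequestWithDeleteDataAndLogFilesAction(partitionPath));
    // Updates: appends to existing log files are undone by appending rollback
    // command blocks rather than by deleting files.
    if (commitMetadata.getPartitionToWriteStats().containsKey(partitionPath)) {
      requests.addAll(generateAppendRollbackBlocksAction(partitionPath, instantToRollback, commitMetadata, table));
    }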
- try { - HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( - table.getMetaClient().getCommitTimeline() - .getInstantDetails(new HoodieInstant(true, instantToRollback.getAction(), instantToRollback.getTimestamp())) - .get(), - HoodieCommitMetadata.class); + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( + table.getMetaClient().getCommitTimeline().getInstantDetails(instantToRollback).get(), + HoodieCommitMetadata.class); - // In case all data was inserts and the commit failed, delete the file belonging to that commit - // We do not know fileIds for inserts (first inserts are either log files or base files), - // delete all files for the corresponding failed commit, if present (same as COW) - partitionRollbackRequests.add( - ListingBasedRollbackRequest.createRollbackRequestWithDeleteDataAndLogFilesAction(partitionPath)); + // In case all data was inserts and the commit failed, delete the file belonging to that commit + // We do not know fileIds for inserts (first inserts are either log files or base files), + // delete all files for the corresponding failed commit, if present (same as COW) + partitionRollbackRequests.add( + ListingBasedRollbackRequest.createRollbackRequestWithDeleteDataAndLogFilesAction(partitionPath)); - // append rollback blocks for updates - if (commitMetadata.getPartitionToWriteStats().containsKey(partitionPath)) { - partitionRollbackRequests - .addAll(generateAppendRollbackBlocksAction(partitionPath, instantToRollback, commitMetadata, table)); - } - break; - } catch (IOException io) { - throw new HoodieIOException("Failed to collect rollback actions for commit " + commit, io); + // append rollback blocks for updates and inserts, as in (A.2) and (B.2) + if (commitMetadata.getPartitionToWriteStats().containsKey(partitionPath)) { + partitionRollbackRequests + .addAll(generateAppendRollbackBlocksAction(partitionPath, instantToRollback, commitMetadata, table)); } + break; default: break; } @@ -222,7 +214,7 @@ private static List generateAppendRollbackBlocksAct private static List generateAppendRollbackBlocksAction(String partitionPath, HoodieInstant rollbackInstant, HoodieCommitMetadata commitMetadata, HoodieTable table) { - ValidationUtils.checkArgument(rollbackInstant.getAction().equals(HoodieTimeline.DELTA_COMMIT_ACTION)); + checkArgument(rollbackInstant.getAction().equals(HoodieTimeline.DELTA_COMMIT_ACTION)); // wStat.getPrevCommit() might not give the right commit time in the following // scenario: If a compaction was scheduled, the new commitTime associated with the requested compaction will be @@ -230,30 +222,40 @@ private static List generateAppendRollbackBlocksAct // But the index (global) might store the baseCommit of the base and not the requested, hence get the // baseCommit always by listing the file slice // With multi writers, rollbacks could be lazy.
and so we need to use getLatestFileSlicesBeforeOrOn() instead of getLatestFileSlices() - Map fileIdToBaseCommitTimeForLogMap = table.getSliceView().getLatestFileSlicesBeforeOrOn(partitionPath, rollbackInstant.getTimestamp(), - true).collect(Collectors.toMap(FileSlice::getFileId, FileSlice::getBaseInstantTime)); + Map latestFileSlices = table.getSliceView() + .getLatestFileSlicesBeforeOrOn(partitionPath, rollbackInstant.getTimestamp(), true) + .collect(Collectors.toMap(FileSlice::getFileId, Function.identity())); + + return commitMetadata.getPartitionToWriteStats().get(partitionPath) + .stream() + .filter(writeStat -> { + // Filter out stats without prevCommit since they are all inserts + boolean validForRollback = (writeStat != null) && (!writeStat.getPrevCommit().equals(HoodieWriteStat.NULL_COMMIT)) + && (writeStat.getPrevCommit() != null) && latestFileSlices.containsKey(writeStat.getFileId()); - return commitMetadata.getPartitionToWriteStats().get(partitionPath).stream().filter(wStat -> { + if (!validForRollback) { + return false; + } - // Filter out stats without prevCommit since they are all inserts - boolean validForRollback = (wStat != null) && (!wStat.getPrevCommit().equals(HoodieWriteStat.NULL_COMMIT)) - && (wStat.getPrevCommit() != null) && fileIdToBaseCommitTimeForLogMap.containsKey(wStat.getFileId()); + FileSlice latestFileSlice = latestFileSlices.get(writeStat.getFileId()); - if (validForRollback) { - // For sanity, log instant time can never be less than base-commit on which we are rolling back - ValidationUtils - .checkArgument(HoodieTimeline.compareTimestamps(fileIdToBaseCommitTimeForLogMap.get(wStat.getFileId()), - HoodieTimeline.LESSER_THAN_OR_EQUALS, rollbackInstant.getTimestamp())); - } + // For sanity, the log-file's base-instant can never be greater than the delta commit being rolled back + checkArgument( + HoodieTimeline.compareTimestamps(latestFileSlice.getBaseInstantTime(), + HoodieTimeline.LESSER_THAN_OR_EQUALS, rollbackInstant.getTimestamp()), + "Log-file base-instant cannot be greater than the instant being rolled back"); - return validForRollback && HoodieTimeline.compareTimestamps(fileIdToBaseCommitTimeForLogMap.get( - // Base Ts should be strictly less. If equal (for inserts-to-logs), the caller employs another option - // to delete and we should not step on it - wStat.getFileId()), HoodieTimeline.LESSER_THAN, rollbackInstant.getTimestamp()); - }).map(wStat -> { - String baseCommitTime = fileIdToBaseCommitTimeForLogMap.get(wStat.getFileId()); - return ListingBasedRollbackRequest.createRollbackRequestWithAppendRollbackBlockAction(partitionPath, wStat.getFileId(), - baseCommitTime); - }).collect(Collectors.toList()); + // Command block "rolling back" the preceding block {@link HoodieCommandBlockTypeEnum#ROLLBACK_PREVIOUS_BLOCK} + // within the latest file-slice is appended iff base-instant of the log-file is _strictly_ less + // than the instant of the Delta Commit being rolled back. Otherwise, the log-file will be cleaned up + // in a different branch of the flow.
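The strict and non-strict timestamp guards above are easiest to read side by side. A small illustration with made-up instant times (compareTimestamps and the two operators are exactly the ones used in this hunk):

    String baseInstant = "20220101120000"; // base-instant of the latest file slice (illustrative)
    String rollbackTs = "20220101130000";  // delta commit being rolled back (illustrative)
    // Sanity guard: the slice's base-instant may never come after the rolled-back commit.
    checkArgument(HoodieTimeline.compareTimestamps(baseInstant, HoodieTimeline.LESSER_THAN_OR_EQUALS, rollbackTs));
    // A rollback command block is appended only in the strictly-less case; on equality
    // the log file was created by the rolled-back commit itself and is deleted instead.
    boolean appendCommandBlock = HoodieTimeline.compareTimestamps(baseInstant, HoodieTimeline.LESSER_THAN, rollbackTs);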
+ return HoodieTimeline.compareTimestamps(latestFileSlice.getBaseInstantTime(), HoodieTimeline.LESSER_THAN, rollbackInstant.getTimestamp()); + }) + .map(writeStat -> { + FileSlice latestFileSlice = latestFileSlices.get(writeStat.getFileId()); + return ListingBasedRollbackRequest.createRollbackRequestWithAppendRollbackBlockAction(partitionPath, + writeStat.getFileId(), latestFileSlice.getBaseInstantTime(), writeStat); + }) + .collect(Collectors.toList()); } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/SerializableHoodieRollbackRequest.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/SerializableHoodieRollbackRequest.java index acd1c50badbc7..8f19692ed7c72 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/SerializableHoodieRollbackRequest.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/rollback/SerializableHoodieRollbackRequest.java @@ -20,6 +20,7 @@ import org.apache.hudi.avro.model.HoodieRollbackRequest; +import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -29,7 +30,7 @@ * HoodieRollbackRequest in HoodieRollbackPlan (avro pojo) is not operable directly within spark parallel engine. * Hence it is converted to this {@link SerializableHoodieRollbackRequest} and then used within spark.parallelize. */ -public class SerializableHoodieRollbackRequest { +public class SerializableHoodieRollbackRequest implements Serializable { private final String partitionPath; private final String fileId; diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/DowngradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/DowngradeHandler.java index 24b9d6f5da6f4..45bbd78c3fb36 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/DowngradeHandler.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/DowngradeHandler.java @@ -35,10 +35,10 @@ public interface DowngradeHandler { * @param config instance of {@link HoodieWriteConfig} to be used. * @param context instance of {@link HoodieEngineContext} to be used. * @param instantTime current instant time that should not be touched. - * @param upgradeDowngradeHelper instance of {@link BaseUpgradeDowngradeHelper} to be used. + * @param upgradeDowngradeHelper instance of {@link SupportsUpgradeDowngrade} to be used. * @return Map of config properties and their values to be added to table properties.
*/ Map downgrade( HoodieWriteConfig config, HoodieEngineContext context, String instantTime, - BaseUpgradeDowngradeHelper upgradeDowngradeHelper); + SupportsUpgradeDowngrade upgradeDowngradeHelper); } diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/cluster/JavaClusteringPlanActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/FourToThreeDowngradeHandler.java similarity index 52% rename from hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/cluster/JavaClusteringPlanActionExecutor.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/FourToThreeDowngradeHandler.java index 1d78ecc2bf41c..17dc01d0213e7 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/cluster/JavaClusteringPlanActionExecutor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/FourToThreeDowngradeHandler.java @@ -17,27 +17,22 @@ * under the License. */ -package org.apache.hudi.table.action.cluster; +package org.apache.hudi.table.upgrade; -import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.config.ConfigProperty; import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieTable; -import java.util.List; +import java.util.Collections; import java.util.Map; -public class JavaClusteringPlanActionExecutor extends - BaseClusteringPlanActionExecutor>, List, List> { +/** + * DowngradeHandler to assist in downgrading {@link org.apache.hudi.table.HoodieTable} from version 4 to 3. 
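+ * Version 4 only adds the table checksum property on upgrade, so there is no table state to unwind here and the handler returns an empty property map.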
+ */ +public class FourToThreeDowngradeHandler implements DowngradeHandler { - public JavaClusteringPlanActionExecutor( - HoodieEngineContext context, HoodieWriteConfig config, - HoodieTable>, List, List> table, - String instantTime, Option> extraMetadata) { - super(context, config, table, instantTime, extraMetadata); + @Override + public Map downgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime, SupportsUpgradeDowngrade upgradeDowngradeHelper) { + return Collections.emptyMap(); } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/OneToTwoUpgradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/OneToTwoUpgradeHandler.java index efa0fe472c52c..dbf4d6159dcbd 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/OneToTwoUpgradeHandler.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/OneToTwoUpgradeHandler.java @@ -35,7 +35,7 @@ public class OneToTwoUpgradeHandler implements UpgradeHandler { @Override public Map upgrade( HoodieWriteConfig config, HoodieEngineContext context, String instantTime, - BaseUpgradeDowngradeHelper upgradeDowngradeHelper) { + SupportsUpgradeDowngrade upgradeDowngradeHelper) { Map tablePropsToAdd = new Hashtable<>(); tablePropsToAdd.put(HoodieTableConfig.PARTITION_FIELDS, upgradeDowngradeHelper.getPartitionColumns(config)); tablePropsToAdd.put(HoodieTableConfig.RECORDKEY_FIELDS, config.getString(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key())); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/OneToZeroDowngradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/OneToZeroDowngradeHandler.java index e6051cf321b50..14fe8e2b88713 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/OneToZeroDowngradeHandler.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/OneToZeroDowngradeHandler.java @@ -40,7 +40,7 @@ public class OneToZeroDowngradeHandler implements DowngradeHandler { @Override public Map downgrade( HoodieWriteConfig config, HoodieEngineContext context, String instantTime, - BaseUpgradeDowngradeHelper upgradeDowngradeHelper) { + SupportsUpgradeDowngrade upgradeDowngradeHelper) { HoodieTable table = upgradeDowngradeHelper.getTable(config, context); // fetch pending commit info HoodieTimeline inflightTimeline = table.getMetaClient().getCommitsTimeline().filterPendingExcludingCompaction(); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/BaseUpgradeDowngradeHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/SupportsUpgradeDowngrade.java similarity index 97% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/BaseUpgradeDowngradeHelper.java rename to hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/SupportsUpgradeDowngrade.java index d3f157be954da..5e6b9db913fa5 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/BaseUpgradeDowngradeHelper.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/SupportsUpgradeDowngrade.java @@ -26,7 +26,7 @@ /** * Interface for engine-specific logic needed for upgrade and downgrade actions. */ -public interface BaseUpgradeDowngradeHelper { +public interface SupportsUpgradeDowngrade { /** * @param config Write config. 
* @param context {@link HoodieEngineContext} instance to use. diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ThreeToFourUpgradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ThreeToFourUpgradeHandler.java new file mode 100644 index 0000000000000..72e96bb4103bc --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ThreeToFourUpgradeHandler.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.table.upgrade; + +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.table.HoodieTableConfig; +import org.apache.hudi.config.HoodieWriteConfig; + +import java.util.Hashtable; +import java.util.Map; + +/** + * UpgradeHandler to assist in upgrading {@link org.apache.hudi.table.HoodieTable} from version 3 to 4. + */ +public class ThreeToFourUpgradeHandler implements UpgradeHandler { + + @Override + public Map upgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime, SupportsUpgradeDowngrade upgradeDowngradeHelper) { + Map tablePropsToAdd = new Hashtable<>(); + tablePropsToAdd.put(HoodieTableConfig.TABLE_CHECKSUM, String.valueOf(HoodieTableConfig.generateChecksum(config.getProps()))); + return tablePropsToAdd; + } +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ThreeToTwoDowngradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ThreeToTwoDowngradeHandler.java index 964859c0ae07d..4f209f05ffc9b 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ThreeToTwoDowngradeHandler.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ThreeToTwoDowngradeHandler.java @@ -33,7 +33,7 @@ public class ThreeToTwoDowngradeHandler implements DowngradeHandler { @Override - public Map downgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime, BaseUpgradeDowngradeHelper upgradeDowngradeHelper) { + public Map downgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime, SupportsUpgradeDowngrade upgradeDowngradeHelper) { if (config.isMetadataTableEnabled()) { // Metadata Table in version 3 is synchronous and in version 2 is asynchronous. Downgrading to asynchronous // removes the checks in code to decide whether to use a LogBlock or not. 
Also, the schema for the diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/TwoToOneDowngradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/TwoToOneDowngradeHandler.java index ee638a16f8633..de1a1067fe111 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/TwoToOneDowngradeHandler.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/TwoToOneDowngradeHandler.java @@ -54,7 +54,7 @@ public class TwoToOneDowngradeHandler implements DowngradeHandler { @Override public Map downgrade( HoodieWriteConfig config, HoodieEngineContext context, String instantTime, - BaseUpgradeDowngradeHelper upgradeDowngradeHelper) { + SupportsUpgradeDowngrade upgradeDowngradeHelper) { HoodieTable table = upgradeDowngradeHelper.getTable(config, context); HoodieTableMetaClient metaClient = table.getMetaClient(); @@ -115,9 +115,11 @@ private void convertToDirectMarkers(final String commitInstantTime, + "\" is not supported for rollback."); } } else { - // In case of partial failures during downgrade, there is a chance that marker type file was deleted, - // but timeline server based marker files are left. So deletes them if any - deleteTimelineBasedMarkerFiles(context, markerDir, fileSystem, parallelism); + if (fileSystem.exists(new Path(markerDir))) { + // In case of partial failures during downgrade, there is a chance that the marker type file was deleted, + // but timeline server based marker files are left. So delete them, if any are left + deleteTimelineBasedMarkerFiles(context, markerDir, fileSystem, parallelism); + } } } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/TwoToThreeUpgradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/TwoToThreeUpgradeHandler.java index bff3788d56cfe..c13d21ec201a0 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/TwoToThreeUpgradeHandler.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/TwoToThreeUpgradeHandler.java @@ -35,7 +35,7 @@ */ public class TwoToThreeUpgradeHandler implements UpgradeHandler { @Override - public Map upgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime, BaseUpgradeDowngradeHelper upgradeDowngradeHelper) { + public Map upgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime, SupportsUpgradeDowngrade upgradeDowngradeHelper) { if (config.isMetadataTableEnabled()) { // Metadata Table in version 2 is asynchronous and in version 3 is synchronous. Synchronous table will not // sync any instants not already synced. So it's simpler to re-bootstrap the table.
Also, the schema for the diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeDowngrade.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeDowngrade.java index 0e8f752a8f682..1a75ff51cabd4 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeDowngrade.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeDowngrade.java @@ -42,7 +42,7 @@ public class UpgradeDowngrade { private static final Logger LOG = LogManager.getLogger(UpgradeDowngrade.class); public static final String HOODIE_UPDATED_PROPERTY_FILE = "hoodie.properties.updated"; - private final BaseUpgradeDowngradeHelper upgradeDowngradeHelper; + private final SupportsUpgradeDowngrade upgradeDowngradeHelper; private HoodieTableMetaClient metaClient; protected HoodieWriteConfig config; protected HoodieEngineContext context; @@ -52,7 +52,7 @@ public class UpgradeDowngrade { public UpgradeDowngrade( HoodieTableMetaClient metaClient, HoodieWriteConfig config, HoodieEngineContext context, - BaseUpgradeDowngradeHelper upgradeDowngradeHelper) { + SupportsUpgradeDowngrade upgradeDowngradeHelper) { this.metaClient = metaClient; this.config = config; this.context = context; @@ -143,6 +143,8 @@ protected Map upgrade(HoodieTableVersion fromVersion, Ho return new OneToTwoUpgradeHandler().upgrade(config, context, instantTime, upgradeDowngradeHelper); } else if (fromVersion == HoodieTableVersion.TWO && toVersion == HoodieTableVersion.THREE) { return new TwoToThreeUpgradeHandler().upgrade(config, context, instantTime, upgradeDowngradeHelper); + } else if (fromVersion == HoodieTableVersion.THREE && toVersion == HoodieTableVersion.FOUR) { + return new ThreeToFourUpgradeHandler().upgrade(config, context, instantTime, upgradeDowngradeHelper); } else { throw new HoodieUpgradeDowngradeException(fromVersion.versionCode(), toVersion.versionCode(), true); } @@ -155,6 +157,8 @@ protected Map downgrade(HoodieTableVersion fromVersion, return new TwoToOneDowngradeHandler().downgrade(config, context, instantTime, upgradeDowngradeHelper); } else if (fromVersion == HoodieTableVersion.THREE && toVersion == HoodieTableVersion.TWO) { return new ThreeToTwoDowngradeHandler().downgrade(config, context, instantTime, upgradeDowngradeHelper); + } else if (fromVersion == HoodieTableVersion.FOUR && toVersion == HoodieTableVersion.THREE) { + return new FourToThreeDowngradeHandler().downgrade(config, context, instantTime, upgradeDowngradeHelper); } else { throw new HoodieUpgradeDowngradeException(fromVersion.versionCode(), toVersion.versionCode(), false); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeHandler.java index 9dc477ffc9dc6..147aa4d8ab2dd 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeHandler.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeHandler.java @@ -35,10 +35,10 @@ public interface UpgradeHandler { * @param config instance of {@link HoodieWriteConfig} to be used. * @param context instance of {@link HoodieEngineContext} to be used. * @param instantTime current instant time that should not be touched. - * @param upgradeDowngradeHelper instance of {@link BaseUpgradeDowngradeHelper} to be used. 
+ * @param upgradeDowngradeHelper instance of {@link SupportsUpgradeDowngrade} to be used. * @return Map of config properties and their values to be added to table properties. */ Map upgrade( HoodieWriteConfig config, HoodieEngineContext context, String instantTime, - BaseUpgradeDowngradeHelper upgradeDowngradeHelper); + SupportsUpgradeDowngrade upgradeDowngradeHelper); } diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ZeroToOneUpgradeHandler.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ZeroToOneUpgradeHandler.java index 1aebbf6b4c42d..6a114154c8778 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ZeroToOneUpgradeHandler.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/ZeroToOneUpgradeHandler.java @@ -57,7 +57,7 @@ public class ZeroToOneUpgradeHandler implements UpgradeHandler { @Override public Map upgrade( HoodieWriteConfig config, HoodieEngineContext context, String instantTime, - BaseUpgradeDowngradeHelper upgradeDowngradeHelper) { + SupportsUpgradeDowngrade upgradeDowngradeHelper) { // fetch pending commit info HoodieTable table = upgradeDowngradeHelper.getTable(config, context); HoodieTimeline inflightTimeline = table.getMetaClient().getCommitsTimeline().filterPendingExcludingCompaction(); diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/async/TestAsyncArchiveService.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/async/TestAsyncArchiveService.java new file mode 100644 index 0000000000000..9dad8b8020a1f --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/async/TestAsyncArchiveService.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
+ */ + +package org.apache.hudi.async; + +import org.apache.hudi.client.BaseHoodieWriteClient; +import org.apache.hudi.config.HoodieWriteConfig; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +import java.util.concurrent.ExecutionException; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +@ExtendWith(MockitoExtension.class) +class TestAsyncArchiveService { + + @Mock + BaseHoodieWriteClient writeClient; + @Mock + HoodieWriteConfig config; + + @Test + void startAsyncArchiveReturnsNullWhenAutoArchiveDisabled() { + when(config.isAutoArchive()).thenReturn(false); + when(writeClient.getConfig()).thenReturn(config); + assertNull(AsyncArchiveService.startAsyncArchiveIfEnabled(writeClient)); + } + + @Test + void startAsyncArchiveReturnsNullWhenAsyncArchiveDisabled() { + when(config.isAutoArchive()).thenReturn(true); + when(config.isAsyncArchive()).thenReturn(false); + when(writeClient.getConfig()).thenReturn(config); + assertNull(AsyncArchiveService.startAsyncArchiveIfEnabled(writeClient)); + } + + @Test + void startAsyncArchiveIfEnabled() { + when(config.isAutoArchive()).thenReturn(true); + when(config.isAsyncArchive()).thenReturn(true); + when(writeClient.getConfig()).thenReturn(config); + assertNotNull(AsyncArchiveService.startAsyncArchiveIfEnabled(writeClient)); + } + + @Test + void startServiceShouldInvokeCallArchiveMethod() throws ExecutionException, InterruptedException { + AsyncArchiveService service = new AsyncArchiveService(writeClient); + assertEquals(true, service.startService().getLeft().get()); + verify(writeClient).archive(); + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/async/TestHoodieAsyncTableService.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/async/TestHoodieAsyncTableService.java new file mode 100644 index 0000000000000..0c19576d042bf --- /dev/null +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/async/TestHoodieAsyncTableService.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.async; + +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieWriteConfig; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.mockito.Mockito.when; + +@ExtendWith(MockitoExtension.class) +class TestHoodieAsyncTableService { + + @Test + void tableServiceShouldNotStartIfDisabled(@Mock HoodieWriteConfig config) { + when(config.areTableServicesEnabled()).thenReturn(false); + HoodieAsyncTableService service = new DummyAsyncTableService(config); + service.start(null); + assertFalse(service.isStarted()); + } + + private static class DummyAsyncTableService extends HoodieAsyncTableService { + + protected DummyAsyncTableService(HoodieWriteConfig writeConfig) { + super(writeConfig); + } + + @Override + protected Pair startService() { + return null; + } + } +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestInProcessLockProvider.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestInProcessLockProvider.java index 9e7472c13db98..6c245787449d9 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestInProcessLockProvider.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestInProcessLockProvider.java @@ -238,7 +238,7 @@ public void testRedundantUnlock() { assertDoesNotThrow(() -> { inProcessLockProvider.unlock(); }); - assertThrows(HoodieLockException.class, () -> { + assertDoesNotThrow(() -> { inProcessLockProvider.unlock(); }); } @@ -246,7 +246,7 @@ public void testRedundantUnlock() { @Test public void testUnlockWithoutLock() { InProcessLockProvider inProcessLockProvider = new InProcessLockProvider(lockConfiguration, hadoopConfiguration); - assertThrows(HoodieLockException.class, () -> { + assertDoesNotThrow(() -> { inProcessLockProvider.unlock(); }); } diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestTransactionManager.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestTransactionManager.java index a1a7f6a3122d0..22f8017841a83 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestTransactionManager.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/client/transaction/TestTransactionManager.java @@ -81,7 +81,7 @@ public void testSingleWriterNestedTransaction() { }); transactionManager.endTransaction(); - assertThrows(HoodieLockException.class, () -> { + assertDoesNotThrow(() -> { transactionManager.endTransaction(); }); } diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/config/TestHoodieWriteConfig.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/config/TestHoodieWriteConfig.java index c86a34a609f1b..2c3ae98c6e6be 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/config/TestHoodieWriteConfig.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/config/TestHoodieWriteConfig.java @@ -82,7 +82,7 @@ public void testPropertyLoading(boolean withAlternative) throws IOException { public void testDefaultIndexAccordingToEngineType() { 
testEngineSpecificConfig(HoodieWriteConfig::getIndexType, constructConfigMap( - EngineType.SPARK, HoodieIndex.IndexType.BLOOM, + EngineType.SPARK, HoodieIndex.IndexType.SIMPLE, EngineType.FLINK, HoodieIndex.IndexType.INMEMORY, EngineType.JAVA, HoodieIndex.IndexType.INMEMORY)); } @@ -167,7 +167,7 @@ public void testDefaultLockProviderWhenAsyncServicesEnabled() { } }); assertFalse(writeConfig.areAnyTableServicesAsync()); - assertTrue(writeConfig.areAnyTableServicesInline()); + assertTrue(writeConfig.areAnyTableServicesExecutedInline()); assertEquals(HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.defaultValue(), writeConfig.getLockProviderClass()); // 5. User override for the lock provider should always take the precedence diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java index 86a0886de664d..fd25d92cba62e 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/io/storage/TestHoodieHFileReaderWriter.java @@ -23,6 +23,7 @@ import org.apache.hudi.common.bloom.BloomFilterTypeCode; import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.model.EmptyHoodieRecordPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; @@ -103,7 +104,7 @@ private HoodieHFileWriter createHFileWriter(Schema avroSchema, boolean populateM String instantTime = "000"; HoodieHFileConfig hoodieHFileConfig = new HoodieHFileConfig(conf, Compression.Algorithm.GZ, 1024 * 1024, 120 * 1024 * 1024, - PREFETCH_ON_OPEN, CACHE_DATA_IN_L1, DROP_BEHIND_CACHE_COMPACTION, filter, HFILE_COMPARATOR); + HoodieHFileReader.KEY_FIELD_NAME, PREFETCH_ON_OPEN, CACHE_DATA_IN_L1, DROP_BEHIND_CACHE_COMPACTION, filter, HFILE_COMPARATOR); return new HoodieHFileWriter(instantTime, filePath, hoodieHFileConfig, avroSchema, mockTaskContextSupplier, populateMetaFields); } @@ -122,7 +123,7 @@ public void testWriteReadHFile(boolean populateMetaFields, boolean testAvroWithM record.put("time", Integer.toString(RANDOM.nextInt())); record.put("number", i); if (testAvroWithMeta) { - writer.writeAvroWithMetadata(record, new HoodieRecord(new HoodieKey((String) record.get("_row_key"), + writer.writeAvroWithMetadata(record, new HoodieAvroRecord(new HoodieKey((String) record.get("_row_key"), Integer.toString((Integer) record.get("number"))), new EmptyHoodieRecordPayload())); // payload does not matter. GenericRecord passed in is what matters // only HoodieKey will be looked up from the 2nd arg(HoodieRecord). 
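+ // HoodieRecord construction now goes through the HoodieAvroRecord subclass; a minimal, + // illustrative stand-alone construction (key values here are made up): + // new HoodieAvroRecord(new HoodieKey("row-key-1", "2022/01/01"), new EmptyHoodieRecordPayload());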
} else { @@ -170,4 +171,4 @@ private Set getRandomKeys(int count, List keys) { } return rowKeys; } -} \ No newline at end of file +} diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/TestMetricsReporterFactory.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/TestMetricsReporterFactory.java index edd2302a6ecc7..3689755e4447d 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/TestMetricsReporterFactory.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/metrics/TestMetricsReporterFactory.java @@ -21,10 +21,10 @@ import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.metrics.custom.CustomizableMetricsReporter; import com.codahale.metrics.MetricRegistry; -import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.metrics.userdefined.AbstractUserDefinedMetricsReporter; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; @@ -63,7 +63,7 @@ public void metricsReporterFactoryShouldReturnUserDefinedReporter() { when(config.getProps()).thenReturn(props); MetricsReporter reporter = MetricsReporterFactory.createReporter(config, registry); - assertTrue(reporter instanceof AbstractUserDefinedMetricsReporter); + assertTrue(reporter instanceof CustomizableMetricsReporter); assertEquals(props, ((DummyMetricsReporter) reporter).getProps()); assertEquals(registry, ((DummyMetricsReporter) reporter).getRegistry()); } @@ -75,7 +75,7 @@ public void metricsReporterFactoryShouldThrowExceptionWhenMetricsReporterClassIs assertThrows(HoodieException.class, () -> MetricsReporterFactory.createReporter(config, registry)); } - public static class DummyMetricsReporter extends AbstractUserDefinedMetricsReporter { + public static class DummyMetricsReporter extends CustomizableMetricsReporter { public DummyMetricsReporter(Properties props, MetricRegistry registry) { super(props, registry); diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/HoodieWriteableTestTable.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/HoodieWriteableTestTable.java index 32fd200145e9b..3488a1365ce88 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/HoodieWriteableTestTable.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/HoodieWriteableTestTable.java @@ -27,6 +27,7 @@ import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.log.HoodieLogFormat; @@ -41,6 +42,7 @@ import org.apache.hudi.io.storage.HoodieOrcConfig; import org.apache.hudi.io.storage.HoodieOrcWriter; import org.apache.hudi.io.storage.HoodieParquetWriter; +import org.apache.hudi.metadata.HoodieTableMetadataWriter; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; @@ -48,7 +50,6 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hudi.metadata.HoodieTableMetadataWriter; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import 
org.apache.orc.CompressionKind; @@ -118,7 +119,7 @@ public HoodieWriteableTestTable withInserts(String partition, String fileId, Lis config, schema, contextSupplier, populateMetaFields)) { int seqId = 1; for (HoodieRecord record : records) { - GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get(); + GenericRecord avroRecord = (GenericRecord) ((HoodieRecordPayload) record.getData()).getInsertValue(schema).get(); if (populateMetaFields) { HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, currentInstantTime, String.valueOf(seqId++)); HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(), fileName); @@ -141,7 +142,7 @@ public HoodieWriteableTestTable withInserts(String partition, String fileId, Lis config, schema, contextSupplier)) { int seqId = 1; for (HoodieRecord record : records) { - GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get(); + GenericRecord avroRecord = (GenericRecord) ((HoodieRecordPayload) record.getData()).getInsertValue(schema).get(); HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, currentInstantTime, String.valueOf(seqId++)); HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(), fileName); writer.writeAvro(record.getRecordKey(), avroRecord); @@ -175,14 +176,14 @@ private Pair appendRecordsToLogFile(List gr header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); logWriter.appendBlock(new HoodieAvroDataBlock(groupedRecords.stream().map(r -> { try { - GenericRecord val = (GenericRecord) r.getData().getInsertValue(schema).get(); + GenericRecord val = (GenericRecord) ((HoodieRecordPayload) r.getData()).getInsertValue(schema).get(); HoodieAvroUtils.addHoodieKeyToRecord(val, r.getRecordKey(), r.getPartitionPath(), ""); return (IndexedRecord) val; } catch (IOException e) { LOG.warn("Failed to convert record " + r.toString(), e); return null; } - }).collect(Collectors.toList()), header)); + }).collect(Collectors.toList()), header, HoodieRecord.RECORD_KEY_METADATA_FIELD)); return Pair.of(partitionPath, logWriter.getLogFile()); } } diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/providers/HoodieWriteClientProvider.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/providers/HoodieWriteClientProvider.java index 9bc559deb5ba4..f67e158c8395d 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/providers/HoodieWriteClientProvider.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/providers/HoodieWriteClientProvider.java @@ -19,12 +19,12 @@ package org.apache.hudi.testutils.providers; -import org.apache.hudi.client.AbstractHoodieWriteClient; +import org.apache.hudi.client.BaseHoodieWriteClient; import org.apache.hudi.config.HoodieWriteConfig; import java.io.IOException; public interface HoodieWriteClientProvider { - AbstractHoodieWriteClient getHoodieWriteClient(HoodieWriteConfig cfg) throws IOException; + BaseHoodieWriteClient getHoodieWriteClient(HoodieWriteConfig cfg) throws IOException; } diff --git a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/utils/TestMetadataConversionUtils.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/utils/TestMetadataConversionUtils.java index c0952bc5a7204..415c12a6407c6 100644 --- a/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/utils/TestMetadataConversionUtils.java +++ 
b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/utils/TestMetadataConversionUtils.java @@ -188,7 +188,6 @@ private void createRollbackMetadata(String instantTime) throws Exception { rollbackPartitionMetadata.setPartitionPath("p1"); rollbackPartitionMetadata.setSuccessDeleteFiles(Arrays.asList("f1")); rollbackPartitionMetadata.setFailedDeleteFiles(new ArrayList<>()); - rollbackPartitionMetadata.setWrittenLogFiles(new HashMap<>()); rollbackPartitionMetadata.setRollbackLogFiles(new HashMap<>()); Map partitionMetadataMap = new HashMap<>(); partitionMetadataMap.put("p1", rollbackPartitionMetadata); diff --git a/hudi-client/hudi-flink-client/pom.xml b/hudi-client/hudi-flink-client/pom.xml index b2cc6949bb08b..b6f1f3d372d28 100644 --- a/hudi-client/hudi-flink-client/pom.xml +++ b/hudi-client/hudi-flink-client/pom.xml @@ -60,7 +60,7 @@ org.apache.flink - flink-table-runtime-blink_${scala.binary.version} + flink-table-runtime_${scala.binary.version} ${flink.version} provided @@ -159,7 +159,7 @@ org.apache.flink - flink-runtime_${scala.binary.version} + flink-runtime ${flink.version} test tests diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/HoodieFlinkWriteClient.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/HoodieFlinkWriteClient.java index 4108ba425e8ca..1f5d14af744fb 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/HoodieFlinkWriteClient.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/client/HoodieFlinkWriteClient.java @@ -18,6 +18,7 @@ package org.apache.hudi.client; +import org.apache.hudi.async.AsyncCleanerService; import org.apache.hudi.client.common.HoodieFlinkEngineContext; import org.apache.hudi.common.data.HoodieList; import org.apache.hudi.common.engine.HoodieEngineContext; @@ -39,7 +40,6 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieCommitException; -import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieNotSupportedException; import org.apache.hudi.index.FlinkHoodieIndexFactory; import org.apache.hudi.index.HoodieIndex; @@ -68,7 +68,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; import java.text.ParseException; import java.util.HashMap; import java.util.Iterator; @@ -78,7 +77,7 @@ @SuppressWarnings("checkstyle:LineLength") public class HoodieFlinkWriteClient extends - AbstractHoodieWriteClient>, List, List> { + BaseHoodieWriteClient>, List, List> { private static final Logger LOG = LoggerFactory.getLogger(HoodieFlinkWriteClient.class); @@ -334,10 +333,7 @@ protected void postCommit(HoodieTable>, List, // Delete the marker directory for the instant. WriteMarkersFactory.get(config.getMarkersType(), createTable(config, hadoopConf), instantTime) .quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism()); - if (config.isAutoArchive()) { - // We cannot have unbounded commit files. 
Archive commits if we have to archive - archive(table); - } + autoArchiveOnCommit(table); } finally { this.heartbeatClient.stop(instantTime); } @@ -346,23 +342,20 @@ protected void postCommit(HoodieTable>, List, @Override public void commitCompaction( String compactionInstantTime, - List writeStatuses, - Option> extraMetadata) throws IOException { + HoodieCommitMetadata metadata, + Option> extraMetadata) { HoodieFlinkTable table = getHoodieTable(); - HoodieCommitMetadata metadata = CompactHelpers.getInstance().createCompactionMetadata( - table, compactionInstantTime, HoodieList.of(writeStatuses), config.getSchema()); extraMetadata.ifPresent(m -> m.forEach(metadata::addMetadata)); - completeCompaction(metadata, writeStatuses, table, compactionInstantTime); + completeCompaction(metadata, table, compactionInstantTime); } @Override public void completeCompaction( HoodieCommitMetadata metadata, - List writeStatuses, HoodieTable>, List, List> table, String compactionCommitTime) { this.context.setJobStatus(this.getClass().getSimpleName(), "Collect compaction write status and commit compaction"); - List writeStats = writeStatuses.stream().map(WriteStatus::getStat).collect(Collectors.toList()); + List writeStats = metadata.getWriteStats(); final HoodieInstant compactionInstant = new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, compactionCommitTime); try { this.txnManager.beginTransaction(Option.of(compactionInstant), Option.empty()); @@ -391,16 +384,11 @@ public void completeCompaction( } @Override - protected List compact(String compactionInstantTime, boolean shouldComplete) { + protected HoodieWriteMetadata> compact(String compactionInstantTime, boolean shouldComplete) { // only used for metadata table, the compaction happens in single thread - try { - List writeStatuses = - getHoodieTable().compact(context, compactionInstantTime).getWriteStatuses(); - commitCompaction(compactionInstantTime, writeStatuses, Option.empty()); - return writeStatuses; - } catch (IOException e) { - throw new HoodieException("Error while compacting instant: " + compactionInstantTime); - } + HoodieWriteMetadata> compactionMetadata = getHoodieTable().compact(context, compactionInstantTime); + commitCompaction(compactionInstantTime, compactionMetadata.getCommitMetadata().get(), Option.empty()); + return compactionMetadata; } @Override diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/FlinkHoodieIndex.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/FlinkHoodieIndex.java index 847a2183a156d..66c1b07793ee7 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/FlinkHoodieIndex.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/FlinkHoodieIndex.java @@ -25,7 +25,6 @@ import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.data.HoodieList; import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.config.HoodieWriteConfig; @@ -33,12 +32,12 @@ import org.apache.hudi.table.HoodieTable; import java.util.List; +import java.util.stream.Collectors; /** * Base flink implementation of {@link HoodieIndex}. 
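+ * Records are now exchanged through {@code HoodieData}; the legacy list-based overrides below are adapted to it via {@code HoodieList}, so the payload type parameter is no longer needed.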
- * @param payload type */ -public abstract class FlinkHoodieIndex extends HoodieIndex>, List, List> { +public abstract class FlinkHoodieIndex extends HoodieIndex>, List> { protected FlinkHoodieIndex(HoodieWriteConfig config) { super(config); } @@ -48,21 +47,22 @@ protected FlinkHoodieIndex(HoodieWriteConfig config) { @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) public abstract List updateLocation(List writeStatuses, HoodieEngineContext context, - HoodieTable>, List, List> hoodieTable) throws HoodieIndexException; + HoodieTable hoodieTable) throws HoodieIndexException; @Override @Deprecated @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) public abstract List> tagLocation(List> records, HoodieEngineContext context, - HoodieTable>, List, List> hoodieTable) throws HoodieIndexException; + HoodieTable hoodieTable) throws HoodieIndexException; @Override @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) - public HoodieData> tagLocation( - HoodieData> records, HoodieEngineContext context, + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, HoodieTable hoodieTable) throws HoodieIndexException { - return HoodieList.of(tagLocation(HoodieList.getList(records), context, hoodieTable)); + List> hoodieRecords = tagLocation(HoodieList.getList(records.map(record -> (HoodieRecord) record)), context, hoodieTable); + return HoodieList.of(hoodieRecords.stream().map(r -> (HoodieRecord) r).collect(Collectors.toList())); } @Override diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/FlinkHoodieIndexFactory.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/FlinkHoodieIndexFactory.java index a9196ca9a3d20..54110d93506d9 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/FlinkHoodieIndexFactory.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/FlinkHoodieIndexFactory.java @@ -46,11 +46,11 @@ public static HoodieIndex createIndex(HoodieFlinkEngineContext context, HoodieWr // TODO more indexes to be added switch (config.getIndexType()) { case INMEMORY: - return new FlinkInMemoryStateIndex<>(context, config); + return new FlinkInMemoryStateIndex(context, config); case BLOOM: - return new HoodieBloomIndex<>(config, ListBasedHoodieBloomIndexHelper.getInstance()); + return new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance()); case SIMPLE: - return new HoodieSimpleIndex<>(config, Option.empty()); + return new HoodieSimpleIndex(config, Option.empty()); default: throw new HoodieIndexException("Unsupported index type " + config.getIndexType()); } diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/state/FlinkInMemoryStateIndex.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/state/FlinkInMemoryStateIndex.java index aa779c4252fcd..af9785edbeb0c 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/state/FlinkInMemoryStateIndex.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/index/state/FlinkInMemoryStateIndex.java @@ -22,9 +22,7 @@ import org.apache.hudi.client.common.HoodieFlinkEngineContext; import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.config.HoodieWriteConfig; import 
org.apache.hudi.exception.HoodieIndexException; import org.apache.hudi.index.HoodieIndex; @@ -37,11 +35,8 @@ /** * Hoodie index implementation backed by flink state. - * - * @param type of payload */ -public class FlinkInMemoryStateIndex> - extends HoodieIndex>, List, List> { +public class FlinkInMemoryStateIndex extends HoodieIndex, List> { private static final Logger LOG = LogManager.getLogger(FlinkInMemoryStateIndex.class); @@ -50,8 +45,8 @@ public FlinkInMemoryStateIndex(HoodieFlinkEngineContext context, HoodieWriteConf } @Override - public HoodieData> tagLocation( - HoodieData> records, HoodieEngineContext context, + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, HoodieTable hoodieTable) throws HoodieIndexException { throw new UnsupportedOperationException("No need to tag location for FlinkInMemoryStateIndex"); } @@ -88,4 +83,4 @@ public boolean canIndexLogFiles() { public boolean isImplicitWithStorage() { return true; } -} \ No newline at end of file +} diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/metadata/FlinkHoodieBackedTableMetadataWriter.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/metadata/FlinkHoodieBackedTableMetadataWriter.java index af9fee0688049..275ab4f5e0a33 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/metadata/FlinkHoodieBackedTableMetadataWriter.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/metadata/FlinkHoodieBackedTableMetadataWriter.java @@ -22,11 +22,10 @@ import org.apache.hudi.client.HoodieFlinkWriteClient; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodieList; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.metrics.Registry; -import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.Option; @@ -41,7 +40,7 @@ import java.io.IOException; import java.util.Collections; import java.util.List; -import java.util.stream.Collectors; +import java.util.Map; public class FlinkHoodieBackedTableMetadataWriter extends HoodieBackedTableMetadataWriter { @@ -101,10 +100,12 @@ protected void initialize(HoodieEngineContext eng } @Override - protected void commit(HoodieData hoodieDataRecords, String partitionName, String instantTime, boolean canTriggerTableService) { + protected void commit(String instantTime, Map> partitionRecordsMap, + boolean canTriggerTableService) { ValidationUtils.checkState(enabled, "Metadata table cannot be committed to as it is not enabled"); - List records = (List) hoodieDataRecords.get(); - List recordList = prepRecords(records, partitionName, 1); + ValidationUtils.checkState(metadataMetaClient != null, "Metadata table is not fully initialized yet."); + HoodieData preppedRecords = prepRecords(partitionRecordsMap); + List preppedRecordList = HoodieList.getList(preppedRecords); try (HoodieFlinkWriteClient writeClient = new HoodieFlinkWriteClient(engineContext, metadataWriteConfig)) { if (!metadataMetaClient.getActiveTimeline().filterCompletedInstants().containsInstant(instantTime)) { @@ -119,20 +120,21 @@ protected void commit(HoodieData hoodieDataRecords, String partiti // once rollback is complete, compaction will be retried again, which will 
eventually hit this code block where the respective commit is // already part of completed commit. So, we have to manually remove the completed instant and proceed. // and it is for the same reason we enabled withAllowMultiWriteOnSameInstant for metadata table. - HoodieInstant alreadyCompletedInstant = metadataMetaClient.getActiveTimeline().filterCompletedInstants().filter(entry -> entry.getTimestamp().equals(instantTime)).lastInstant().get(); + HoodieInstant alreadyCompletedInstant = + metadataMetaClient.getActiveTimeline().filterCompletedInstants().filter(entry -> entry.getTimestamp().equals(instantTime)).lastInstant().get(); HoodieActiveTimeline.deleteInstantFile(metadataMetaClient.getFs(), metadataMetaClient.getMetaPath(), alreadyCompletedInstant); metadataMetaClient.reloadActiveTimeline(); } - List statuses = records.size() > 0 - ? writeClient.upsertPreppedRecords(recordList, instantTime) + List statuses = preppedRecordList.size() > 0 + ? writeClient.upsertPreppedRecords(preppedRecordList, instantTime) : Collections.emptyList(); statuses.forEach(writeStatus -> { if (writeStatus.hasErrors()) { throw new HoodieMetadataException("Failed to commit metadata table records at instant " + instantTime); } }); - // flink does not support auto-commit yet, also the auto commit logic is not complete as AbstractHoodieWriteClient now. + // flink does not support auto-commit yet, also the auto commit logic is not complete as BaseHoodieWriteClient now. writeClient.commit(instantTime, statuses, Option.empty(), HoodieActiveTimeline.DELTA_COMMIT_ACTION, Collections.emptyMap()); // reload timeline @@ -147,21 +149,4 @@ protected void commit(HoodieData hoodieDataRecords, String partiti // Update total size of the metadata and count of base/log files metrics.ifPresent(m -> m.updateSizeMetrics(metadataMetaClient, metadata)); } - - /** - * Tag each record with the location in the given partition. - * - * The record is tagged with respective file slice's location based on its record key. - */ - private List prepRecords(List records, String partitionName, int numFileGroups) { - List fileSlices = HoodieTableMetadataUtil.getPartitionLatestFileSlices(metadataMetaClient, partitionName); - ValidationUtils.checkArgument(fileSlices.size() == numFileGroups, String.format("Invalid number of file groups: found=%d, required=%d", fileSlices.size(), numFileGroups)); - - return records.stream().map(r -> { - FileSlice slice = fileSlices.get(HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex(r.getRecordKey(), numFileGroups)); - final String instantTime = slice.isEmpty() ? 
"I" : "U"; - r.setCurrentLocation(new HoodieRecordLocation(instantTime, slice.getFileId())); - return r; - }).collect(Collectors.toList()); - } -} +} \ No newline at end of file diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkCopyOnWriteTable.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkCopyOnWriteTable.java index 9aceffe44f86a..7e41ab150fbf2 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkCopyOnWriteTable.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkCopyOnWriteTable.java @@ -23,6 +23,7 @@ import org.apache.hudi.avro.model.HoodieClusteringPlan; import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.avro.model.HoodieRestoreMetadata; +import org.apache.hudi.avro.model.HoodieRestorePlan; import org.apache.hudi.avro.model.HoodieRollbackMetadata; import org.apache.hudi.avro.model.HoodieRollbackPlan; import org.apache.hudi.avro.model.HoodieSavepointMetadata; @@ -342,6 +343,11 @@ public HoodieSavepointMetadata savepoint(HoodieEngineContext context, String ins throw new HoodieNotSupportedException("Savepoint is not supported yet"); } + @Override + public Option scheduleRestore(HoodieEngineContext context, String restoreInstantTime, String instantToRestore) { + throw new HoodieNotSupportedException("Restore is not supported yet"); + } + @Override public HoodieRestoreMetadata restore(HoodieEngineContext context, String restoreInstantTime, String instantToRestore) { throw new HoodieNotSupportedException("Savepoint and restore is not supported yet"); diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkTable.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkTable.java index 164b00e2d6ce4..2f08a55c956fb 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkTable.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/HoodieFlinkTable.java @@ -53,7 +53,8 @@ public static HoodieFlinkTable create(HoodieW HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(context.getHadoopConf().get()).setBasePath(config.getBasePath()) .setLoadActiveTimelineOnLoad(true).setConsistencyGuardConfig(config.getConsistencyGuardConfig()) - .setLayoutVersion(Option.of(new TimelineLayoutVersion(config.getTimelineLayoutVersion()))).build(); + .setLayoutVersion(Option.of(new TimelineLayoutVersion(config.getTimelineLayoutVersion()))) + .setFileSystemRetryConfig(config.getFileSystemRetryConfig()).build(); return HoodieFlinkTable.create(config, context, metaClient); } diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/BaseFlinkCommitActionExecutor.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/BaseFlinkCommitActionExecutor.java index 5dfa511a8823f..51138cd29daa6 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/BaseFlinkCommitActionExecutor.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/BaseFlinkCommitActionExecutor.java @@ -134,6 +134,12 @@ protected void commit(Option> extraMetadata, HoodieWriteMeta commit(extraMetadata, result, result.getWriteStatuses().stream().map(WriteStatus::getStat).collect(Collectors.toList())); } + protected void setCommitMetadata(HoodieWriteMetadata> result) { + 
result.setCommitMetadata(Option.of(CommitUtils.buildMetadata(result.getWriteStatuses().stream().map(WriteStatus::getStat).collect(Collectors.toList()), + result.getPartitionToReplaceFileIds(), + extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType()))); + } + protected void commit(Option> extraMetadata, HoodieWriteMetadata> result, List writeStats) { String actionType = getCommitActionType(); LOG.info("Committing " + instantTime + ", action Type " + actionType); @@ -144,8 +150,7 @@ protected void commit(Option> extraMetadata, HoodieWriteMeta try { LOG.info("Committing " + instantTime + ", action Type " + getCommitActionType()); HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); - HoodieCommitMetadata metadata = CommitUtils.buildMetadata(writeStats, result.getPartitionToReplaceFileIds(), - extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType()); + HoodieCommitMetadata metadata = result.getCommitMetadata().get(); writeTableMetadata(metadata, actionType); diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkDeleteHelper.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkDeleteHelper.java index 05ac93725bfc9..8dd0c99bae299 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkDeleteHelper.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkDeleteHelper.java @@ -22,6 +22,7 @@ import org.apache.hudi.common.data.HoodieList; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.EmptyHoodieRecordPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.util.collection.Pair; @@ -43,7 +44,7 @@ @SuppressWarnings("checkstyle:LineLength") public class FlinkDeleteHelper extends - AbstractDeleteHelper>, List, List, R> { + BaseDeleteHelper>, List, List, R> { private FlinkDeleteHelper() { } @@ -93,7 +94,7 @@ public HoodieWriteMetadata> execute(String instantTime, } List> dedupedRecords = - dedupedKeys.stream().map(key -> new HoodieRecord<>(key, new EmptyHoodieRecordPayload())).collect(Collectors.toList()); + dedupedKeys.stream().map(key -> new HoodieAvroRecord<>(key, new EmptyHoodieRecordPayload())).collect(Collectors.toList()); Instant beginTag = Instant.now(); // perform index look up to get existing location of records List> taggedRecords = HoodieList.getList( diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkMergeHelper.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkMergeHelper.java index 5ed6d5d529ba3..38d4e60f648ec 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkMergeHelper.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkMergeHelper.java @@ -45,7 +45,7 @@ import scala.collection.immutable.List; -public class FlinkMergeHelper extends AbstractMergeHelper>, +public class FlinkMergeHelper extends BaseMergeHelper>, List, List> { private FlinkMergeHelper() { @@ -91,7 +91,7 @@ public void runMerge(HoodieTable>, List, List ThreadLocal encoderCache = new ThreadLocal<>(); ThreadLocal decoderCache = new ThreadLocal<>(); - wrapper = new BoundedInMemoryExecutor(table.getConfig().getWriteBufferLimitBytes(), new 
IteratorBasedQueueProducer<>(readerIterator), + wrapper = new BoundedInMemoryExecutor<>(table.getConfig().getWriteBufferLimitBytes(), new IteratorBasedQueueProducer<>(readerIterator), Option.of(new UpdateHandler(mergeHandle)), record -> { if (!externalSchemaTransformation) { return record; diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkWriteHelper.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkWriteHelper.java index 3914e486f8a86..d28aafcc4abf8 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkWriteHelper.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/action/commit/FlinkWriteHelper.java @@ -21,6 +21,7 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.data.HoodieList; import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieOperation; import org.apache.hudi.common.model.HoodieRecord; @@ -48,7 +49,7 @@ *
Computing the records batch locations all at a time is a pressure to the engine, * we should avoid that in streaming system. */ -public class FlinkWriteHelper extends AbstractWriteHelper>, +public class FlinkWriteHelper extends BaseWriteHelper>, List, List, R> { private FlinkWriteHelper() { @@ -89,7 +90,7 @@ protected List> tag(List> dedupedRecords, Hoodie @Override public List> deduplicateRecords( - List> records, HoodieIndex index, int parallelism) { + List> records, HoodieIndex index, int parallelism) { Map>>> keyedRecords = records.stream().map(record -> { // If index used is global, then records are expected to differ in their partitionPath final Object key = record.getKey().getRecordKey(); @@ -107,7 +108,7 @@ public List> deduplicateRecords( boolean choosePrev = data1.equals(reducedData); HoodieKey reducedKey = choosePrev ? rec1.getKey() : rec2.getKey(); HoodieOperation operation = choosePrev ? rec1.getOperation() : rec2.getOperation(); - HoodieRecord hoodieRecord = new HoodieRecord<>(reducedKey, reducedData, operation); + HoodieRecord hoodieRecord = new HoodieAvroRecord<>(reducedKey, reducedData, operation); // reuse the location from the first record. hoodieRecord.setCurrentLocation(rec1.getCurrentLocation()); return hoodieRecord; diff --git a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/FlinkUpgradeDowngradeHelper.java b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/FlinkUpgradeDowngradeHelper.java index d097d2e60057c..69acce5627543 100644 --- a/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/FlinkUpgradeDowngradeHelper.java +++ b/hudi-client/hudi-flink-client/src/main/java/org/apache/hudi/table/upgrade/FlinkUpgradeDowngradeHelper.java @@ -29,7 +29,7 @@ /** * Flink upgrade and downgrade helper. 
*/ -public class FlinkUpgradeDowngradeHelper implements BaseUpgradeDowngradeHelper { +public class FlinkUpgradeDowngradeHelper implements SupportsUpgradeDowngrade { private static final FlinkUpgradeDowngradeHelper SINGLETON_INSTANCE = new FlinkUpgradeDowngradeHelper(); diff --git a/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/index/bloom/TestFlinkHoodieBloomIndex.java b/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/index/bloom/TestFlinkHoodieBloomIndex.java index 7b4e3b675ea05..50adabbd585ea 100644 --- a/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/index/bloom/TestFlinkHoodieBloomIndex.java +++ b/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/index/bloom/TestFlinkHoodieBloomIndex.java @@ -23,6 +23,7 @@ import org.apache.hudi.common.bloom.BloomFilterTypeCode; import org.apache.hudi.common.data.HoodieList; import org.apache.hudi.common.data.HoodieMapPair; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.table.HoodieTableMetaClient; @@ -31,7 +32,7 @@ import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.io.HoodieKeyLookupHandle; +import org.apache.hudi.index.HoodieIndexUtils; import org.apache.hudi.table.HoodieFlinkTable; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.testutils.HoodieFlinkClientTestHarness; @@ -115,22 +116,22 @@ public void testLoadInvolvedFiles(boolean rangePruning, boolean treeFiltering, b RawTripTestPayload rowChange1 = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record1 = - new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); RawTripTestPayload rowChange2 = new RawTripTestPayload("{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record2 = - new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); RawTripTestPayload rowChange3 = new RawTripTestPayload("{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record3 = - new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); RawTripTestPayload rowChange4 = new RawTripTestPayload("{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record4 = - new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); List partitions = asList("2016/01/21", "2016/04/01", "2015/03/12"); - List> filesList = index.loadInvolvedFiles(partitions, context, hoodieTable); + List> filesList = index.loadColumnRangesFromFiles(partitions, context, hoodieTable); // Still 0, as no valid commit assertEquals(0, filesList.size()); @@ -140,7 +141,7 @@ public void testLoadInvolvedFiles(boolean rangePruning, boolean treeFiltering, b .withInserts("2015/03/12", "4", 
record2, record3, record4); metaClient.reloadActiveTimeline(); - filesList = index.loadInvolvedFiles(partitions, context, hoodieTable); + filesList = index.loadColumnRangesFromFiles(partitions, context, hoodieTable); assertEquals(4, filesList.size()); if (rangePruning) { @@ -212,16 +213,16 @@ public void testCheckUUIDsAgainstOneFile() throws Exception { + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}"; RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); HoodieRecord record1 = - new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); HoodieRecord record2 = - new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); HoodieRecord record3 = - new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4); HoodieRecord record4 = - new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); // We write record1, record2 to a base file, but the bloom filter contains (record1, // record2, record3). @@ -242,9 +243,8 @@ public void testCheckUUIDsAgainstOneFile() throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); HoodieFlinkTable table = HoodieFlinkTable.create(config, context, metaClient); - HoodieKeyLookupHandle keyHandle = new HoodieKeyLookupHandle<>(config, table, Pair.of(partition, fileId)); - List results = keyHandle.checkCandidatesAgainstFile(hadoopConf, uuids, - new Path(java.nio.file.Paths.get(basePath, partition, filename).toString())); + List results = HoodieIndexUtils.filterKeysFromFile( + new Path(java.nio.file.Paths.get(basePath, partition, filename).toString()), uuids, hadoopConf); assertEquals(results.size(), 2); assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0") || results.get(1).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")); @@ -287,16 +287,16 @@ public void testTagLocation(boolean rangePruning, boolean treeFiltering, boolean String recordStr4 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); HoodieRecord record1 = - new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); HoodieRecord record2 = - new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); HoodieRecord record3 = - new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); 
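A note on the lookup change above: the move from HoodieKeyLookupHandle#checkCandidatesAgainstFile to HoodieIndexUtils.filterKeysFromFile keeps the same two-phase contract that this test exercises. A bloom filter can only answer "maybe present" (here it deliberately contains record3, which was never written to the base file), so candidate keys must still be verified against the keys actually stored in the file. A minimal, self-contained sketch of that contract, using toy types rather than Hudi's API:

import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.function.Predicate;

class TwoPhaseKeyCheckSketch {
  // Phase 1: the bloom filter prunes keys that are definitely absent (it has no
  // false negatives), but may pass false positives through.
  // Phase 2: survivors are verified against the record keys actually read back
  // from the base file, which removes the false positives.
  static List<String> filterCandidateKeys(List<String> candidates,
                                          Predicate<String> bloomMightContain,
                                          Set<String> keysInFile) {
    List<String> confirmed = new ArrayList<>();
    for (String key : candidates) {
      if (bloomMightContain.test(key) && keysInFile.contains(key)) {
        confirmed.add(key); // a bloom-only hit would be a false positive
      }
    }
    return confirmed;
  }
}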
+ new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4); HoodieRecord record4 = - new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); List records = asList(record1, record2, record3, record4); // Also create the metadata and config @@ -355,15 +355,15 @@ public void testCheckExists(boolean rangePruning, boolean treeFiltering, boolean + "\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); HoodieKey key1 = new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()); - HoodieRecord record1 = new HoodieRecord(key1, rowChange1); + HoodieRecord record1 = new HoodieAvroRecord(key1, rowChange1); RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); HoodieKey key2 = new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()); - HoodieRecord record2 = new HoodieRecord(key2, rowChange2); + HoodieRecord record2 = new HoodieAvroRecord(key2, rowChange2); RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); HoodieKey key3 = new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()); RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4); HoodieKey key4 = new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()); - HoodieRecord record4 = new HoodieRecord(key4, rowChange4); + HoodieRecord record4 = new HoodieAvroRecord(key4, rowChange4); List keys = asList(key1, key2, key3, key4); // Also create the metadata and config @@ -374,7 +374,7 @@ public void testCheckExists(boolean rangePruning, boolean treeFiltering, boolean // Let's tag HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance()); List toTagRecords = new ArrayList<>(); - toTagRecords.add(new HoodieRecord(record4.getKey(), null)); + toTagRecords.add(new HoodieAvroRecord(record4.getKey(), null)); List taggedRecords = tagLocation(bloomIndex, toTagRecords, hoodieTable); Map>> recordLocations = new HashMap<>(); for (HoodieRecord taggedRecord : taggedRecords) { @@ -397,7 +397,7 @@ public void testCheckExists(boolean rangePruning, boolean treeFiltering, boolean hoodieTable = HoodieFlinkTable.create(config, context, metaClient); List toTagRecords1 = new ArrayList<>(); for (HoodieKey key : keys) { - taggedRecords.add(new HoodieRecord(key, null)); + taggedRecords.add(new HoodieAvroRecord(key, null)); } taggedRecords = tagLocation(bloomIndex, toTagRecords1, hoodieTable); @@ -437,9 +437,9 @@ public void testBloomFilterFalseError(boolean rangePruning, boolean treeFilterin // We write record1 to a base file, using a bloom filter having both records RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); - HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); - HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); BloomFilter filter = 
BloomFilterFactory.createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name()); filter.add(record2.getRecordKey()); diff --git a/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/testutils/HoodieFlinkWriteableTestTable.java b/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/testutils/HoodieFlinkWriteableTestTable.java index 50e8f776ac635..de95520854646 100644 --- a/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/testutils/HoodieFlinkWriteableTestTable.java +++ b/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/testutils/HoodieFlinkWriteableTestTable.java @@ -19,6 +19,7 @@ package org.apache.hudi.testutils; +import org.apache.avro.generic.IndexedRecord; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.bloom.BloomFilter; import org.apache.hudi.common.bloom.BloomFilterFactory; @@ -26,6 +27,7 @@ import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.log.HoodieLogFormat; import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock; @@ -39,6 +41,7 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; @@ -130,14 +133,14 @@ private Pair appendRecordsToLogFile(List gr header.put(HeaderMetadataType.SCHEMA, schema.toString()); logWriter.appendBlock(new HoodieAvroDataBlock(groupedRecords.stream().map(r -> { try { - GenericRecord val = (GenericRecord) r.getData().getInsertValue(schema).get(); + GenericRecord val = (GenericRecord) ((HoodieRecordPayload) r.getData()).getInsertValue(schema).get(); HoodieAvroUtils.addHoodieKeyToRecord(val, r.getRecordKey(), r.getPartitionPath(), ""); - return (org.apache.avro.generic.IndexedRecord) val; - } catch (java.io.IOException e) { + return (IndexedRecord) val; + } catch (IOException e) { LOG.warn("Failed to convert record " + r.toString(), e); return null; } - }).collect(Collectors.toList()), header)); + }).collect(Collectors.toList()), header, HoodieRecord.RECORD_KEY_METADATA_FIELD)); return Pair.of(partitionPath, logWriter.getLogFile()); } } diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/HoodieJavaWriteClient.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/HoodieJavaWriteClient.java index 212187b2d7552..f365f29329782 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/HoodieJavaWriteClient.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/HoodieJavaWriteClient.java @@ -44,13 +44,12 @@ import com.codahale.metrics.Timer; import org.apache.hadoop.conf.Configuration; -import java.io.IOException; import java.util.List; import java.util.Map; import java.util.stream.Collectors; public class HoodieJavaWriteClient extends - AbstractHoodieWriteClient>, List, List> { + BaseHoodieWriteClient>, List, List> { public HoodieJavaWriteClient(HoodieEngineContext context, HoodieWriteConfig clientConfig) { super(context, clientConfig); @@ -210,21 +209,20 @@ protected List postWrite(HoodieWriteMetadata> res @Override public void commitCompaction(String compactionInstantTime, - List writeStatuses, - Option> extraMetadata) throws IOException { + HoodieCommitMetadata metadata, + Option> extraMetadata) { throw new 
HoodieNotSupportedException("CommitCompaction is not supported in HoodieJavaClient"); } @Override protected void completeCompaction(HoodieCommitMetadata metadata, - List writeStatuses, HoodieTable>, List, List> table, String compactionCommitTime) { throw new HoodieNotSupportedException("CompleteCompaction is not supported in HoodieJavaClient"); } @Override - protected List compact(String compactionInstantTime, + protected HoodieWriteMetadata> compact(String compactionInstantTime, boolean shouldComplete) { throw new HoodieNotSupportedException("Compact is not supported in HoodieJavaClient"); } diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/JavaExecutionStrategy.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/JavaExecutionStrategy.java index a7dc4a3c0fa73..7d7609f0fa0a9 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/JavaExecutionStrategy.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/JavaExecutionStrategy.java @@ -24,8 +24,11 @@ import org.apache.hudi.avro.model.HoodieClusteringPlan; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.JavaTaskContextSupplier; +import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.data.HoodieList; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.ClusteringOperation; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieFileGroupId; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; @@ -70,7 +73,7 @@ * Clustering strategy for Java engine. */ public abstract class JavaExecutionStrategy> - extends ClusteringExecutionStrategy>, List, List> { + extends ClusteringExecutionStrategy>, HoodieData, HoodieData> { private static final Logger LOG = LogManager.getLogger(JavaExecutionStrategy.class); @@ -80,7 +83,7 @@ public JavaExecutionStrategy( } @Override - public HoodieWriteMetadata> performClustering( + public HoodieWriteMetadata> performClustering( HoodieClusteringPlan clusteringPlan, Schema schema, String instantTime) { // execute clustering for each group and collect WriteStatus List writeStatusList = new ArrayList<>(); @@ -89,8 +92,8 @@ public HoodieWriteMetadata> performClustering( inputGroup, clusteringPlan.getStrategy().getStrategyParams(), Option.ofNullable(clusteringPlan.getPreserveHoodieMetadata()).orElse(false), instantTime))); - HoodieWriteMetadata> writeMetadata = new HoodieWriteMetadata<>(); - writeMetadata.setWriteStatuses(writeStatusList); + HoodieWriteMetadata> writeMetadata = new HoodieWriteMetadata<>(); + writeMetadata.setWriteStatuses(HoodieList.of(writeStatusList)); return writeMetadata; } @@ -120,7 +123,7 @@ public abstract List performClusteringWithRecordList( * @param schema Schema of the data including metadata fields. * @return empty for now. 
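 * (Note: despite "empty for now", the method body below does return a partitioner when PLAN_STRATEGY_SORT_COLUMNS is configured, namely a JavaCustomColumnsSortPartitioner built from the comma-separated sort columns and the write schema.)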
*/ - protected Option> getPartitioner(Map strategyParams, Schema schema) { + protected Option>>> getPartitioner(Map strategyParams, Schema schema) { if (strategyParams.containsKey(PLAN_STRATEGY_SORT_COLUMNS.key())) { return Option.of(new JavaCustomColumnsSortPartitioner( strategyParams.get(PLAN_STRATEGY_SORT_COLUMNS.key()).split(","), @@ -237,7 +240,7 @@ private HoodieRecord transform(IndexedRecord indexedRecord) { HoodieKey hoodieKey = new HoodieKey(key, partition); HoodieRecordPayload avroPayload = new RewriteAvroPayload(record); - HoodieRecord hoodieRecord = new HoodieRecord(hoodieKey, avroPayload); + HoodieRecord hoodieRecord = new HoodieAvroRecord(hoodieKey, avroPayload); return hoodieRecord; } } diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaHoodieIndex.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaHoodieIndex.java index 7f8b83f5c7d5d..dd64859cad7e5 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaHoodieIndex.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaHoodieIndex.java @@ -25,7 +25,6 @@ import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.data.HoodieList; import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.config.HoodieWriteConfig; @@ -33,8 +32,9 @@ import org.apache.hudi.table.HoodieTable; import java.util.List; +import java.util.stream.Collectors; -public abstract class JavaHoodieIndex extends HoodieIndex>, List, List> { +public abstract class JavaHoodieIndex extends HoodieIndex>, List> { protected JavaHoodieIndex(HoodieWriteConfig config) { super(config); } @@ -44,21 +44,22 @@ protected JavaHoodieIndex(HoodieWriteConfig config) { @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) public abstract List updateLocation(List writeStatuses, HoodieEngineContext context, - HoodieTable>, List, List> hoodieTable) throws HoodieIndexException; + HoodieTable hoodieTable) throws HoodieIndexException; @Override @Deprecated @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) public abstract List> tagLocation(List> records, HoodieEngineContext context, - HoodieTable>, List, List> hoodieTable) throws HoodieIndexException; + HoodieTable hoodieTable) throws HoodieIndexException; @Override @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) - public HoodieData> tagLocation( - HoodieData> records, HoodieEngineContext context, + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, HoodieTable hoodieTable) throws HoodieIndexException { - return HoodieList.of(tagLocation(HoodieList.getList(records), context, hoodieTable)); + List> hoodieRecords = tagLocation(HoodieList.getList(records.map(record -> (HoodieRecord) record)), context, hoodieTable); + return HoodieList.of(hoodieRecords.stream().map(r -> (HoodieRecord) r).collect(Collectors.toList())); } @Override diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaHoodieIndexFactory.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaHoodieIndexFactory.java index f6135fb132afa..9f4adad8ecf8a 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaHoodieIndexFactory.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/index/JavaHoodieIndexFactory.java @@ -44,9 +44,9 @@ public static 
HoodieIndex createIndex(HoodieWriteConfig config) { // TODO more indexes to be added switch (config.getIndexType()) { case INMEMORY: - return new HoodieInMemoryHashIndex<>(config); + return new HoodieInMemoryHashIndex(config); case BLOOM: - return new HoodieBloomIndex<>(config, ListBasedHoodieBloomIndexHelper.getInstance()); + return new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance()); default: throw new HoodieIndexException("Unsupported index type " + config.getIndexType()); } diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaCopyOnWriteTable.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaCopyOnWriteTable.java index 62a6980d509ab..447ed3e96cd9e 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaCopyOnWriteTable.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/HoodieJavaCopyOnWriteTable.java @@ -23,6 +23,7 @@ import org.apache.hudi.avro.model.HoodieClusteringPlan; import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.avro.model.HoodieRestoreMetadata; +import org.apache.hudi.avro.model.HoodieRestorePlan; import org.apache.hudi.avro.model.HoodieRollbackMetadata; import org.apache.hudi.avro.model.HoodieRollbackPlan; import org.apache.hudi.avro.model.HoodieSavepointMetadata; @@ -48,7 +49,7 @@ import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata; import org.apache.hudi.table.action.clean.CleanActionExecutor; import org.apache.hudi.table.action.clean.CleanPlanActionExecutor; -import org.apache.hudi.table.action.cluster.JavaClusteringPlanActionExecutor; +import org.apache.hudi.table.action.cluster.ClusteringPlanActionExecutor; import org.apache.hudi.table.action.cluster.JavaExecuteClusteringCommitActionExecutor; import org.apache.hudi.table.action.commit.JavaBulkInsertCommitActionExecutor; import org.apache.hudi.table.action.commit.JavaBulkInsertPreppedCommitActionExecutor; @@ -63,15 +64,17 @@ import org.apache.hudi.table.action.restore.CopyOnWriteRestoreActionExecutor; import org.apache.hudi.table.action.rollback.BaseRollbackPlanActionExecutor; import org.apache.hudi.table.action.rollback.CopyOnWriteRollbackActionExecutor; +import org.apache.hudi.table.action.rollback.RestorePlanActionExecutor; import org.apache.hudi.table.action.savepoint.SavepointActionExecutor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import javax.annotation.Nonnull; + import java.io.IOException; import java.util.Collections; import java.util.Iterator; -import javax.annotation.Nonnull; import java.util.List; import java.util.Map; @@ -190,7 +193,7 @@ public HoodieWriteMetadata> compact(HoodieEngineContext contex @Override public Option scheduleClustering(final HoodieEngineContext context, final String instantTime, final Option> extraMetadata) { - return new JavaClusteringPlanActionExecutor<>(context, config, this, instantTime, extraMetadata).execute(); + return new ClusteringPlanActionExecutor<>(context, config, this, instantTime, extraMetadata).execute(); } @Override @@ -247,6 +250,11 @@ public HoodieSavepointMetadata savepoint(HoodieEngineContext context, context, config, this, instantToSavepoint, user, comment).execute(); } + @Override + public Option scheduleRestore(HoodieEngineContext context, String restoreInstantTime, String instantToRestore) { + return new RestorePlanActionExecutor(context, config, this, restoreInstantTime, instantToRestore).execute(); + } + @Override public HoodieRestoreMetadata 
restore(HoodieEngineContext context, String restoreInstantTime, diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/cluster/JavaExecuteClusteringCommitActionExecutor.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/cluster/JavaExecuteClusteringCommitActionExecutor.java index 83364bdc3ad35..168d558143bd3 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/cluster/JavaExecuteClusteringCommitActionExecutor.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/cluster/JavaExecuteClusteringCommitActionExecutor.java @@ -19,46 +19,32 @@ package org.apache.hudi.table.action.cluster; -import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.avro.model.HoodieClusteringGroup; import org.apache.hudi.avro.model.HoodieClusteringPlan; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieCommitMetadata; -import org.apache.hudi.common.model.HoodieFileGroupId; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; -import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.ClusteringUtils; -import org.apache.hudi.common.util.CommitUtils; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieClusteringException; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; -import org.apache.hudi.table.action.cluster.strategy.ClusteringExecutionStrategy; import org.apache.hudi.table.action.commit.BaseJavaCommitActionExecutor; -import org.apache.avro.Schema; - import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; public class JavaExecuteClusteringCommitActionExecutor> extends BaseJavaCommitActionExecutor { private final HoodieClusteringPlan clusteringPlan; - public JavaExecuteClusteringCommitActionExecutor( - HoodieEngineContext context, HoodieWriteConfig config, HoodieTable table, - String instantTime) { + public JavaExecuteClusteringCommitActionExecutor(HoodieEngineContext context, + HoodieWriteConfig config, + HoodieTable table, + String instantTime) { super(context, config, table, instantTime, WriteOperationType.CLUSTER); this.clusteringPlan = ClusteringUtils.getClusteringPlan( table.getMetaClient(), HoodieTimeline.getReplaceCommitRequestedInstant(instantTime)) @@ -68,56 +54,13 @@ public JavaExecuteClusteringCommitActionExecutor( @Override public HoodieWriteMetadata> execute() { - HoodieInstant instant = HoodieTimeline.getReplaceCommitRequestedInstant(instantTime); - // Mark instant as clustering inflight - table.getActiveTimeline().transitionReplaceRequestedToInflight(instant, Option.empty()); - table.getMetaClient().reloadActiveTimeline(); - - final Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema())); - HoodieWriteMetadata> writeMetadata = ( - (ClusteringExecutionStrategy>, List, List>) - ReflectionUtils.loadClass(config.getClusteringExecutionStrategyClass(), - new Class[] 
{HoodieTable.class, HoodieEngineContext.class, HoodieWriteConfig.class}, table, context, config)) - .performClustering(clusteringPlan, schema, instantTime); - List writeStatusList = writeMetadata.getWriteStatuses(); - List statuses = updateIndex(writeStatusList, writeMetadata); - writeMetadata.setWriteStats(statuses.stream().map(WriteStatus::getStat).collect(Collectors.toList())); - writeMetadata.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(writeMetadata)); - validateWriteResult(writeMetadata); - commitOnAutoCommit(writeMetadata); - if (!writeMetadata.getCommitMetadata().isPresent()) { - HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(writeMetadata.getWriteStats().get(), writeMetadata.getPartitionToReplaceFileIds(), - extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType()); - writeMetadata.setCommitMetadata(Option.of(commitMetadata)); - } - return writeMetadata; - } - - /** - * Validate actions taken by clustering. In the first implementation, we validate at least one new file is written. - * But we can extend this to add more validation. E.g. number of records read = number of records written etc. - * We can also make these validations in BaseCommitActionExecutor to reuse pre-commit hooks for multiple actions. - */ - private void validateWriteResult(HoodieWriteMetadata> writeMetadata) { - if (writeMetadata.getWriteStatuses().isEmpty()) { - throw new HoodieClusteringException("Clustering plan produced 0 WriteStatus for " + instantTime - + " #groups: " + clusteringPlan.getInputGroups().size() + " expected at least " - + clusteringPlan.getInputGroups().stream().mapToInt(HoodieClusteringGroup::getNumOutputFileGroups).sum() - + " write statuses"); - } + HoodieWriteMetadata> writeMetadata = executeClustering(clusteringPlan); + List transformedWriteStatuses = writeMetadata.getWriteStatuses().collectAsList(); + return writeMetadata.clone(transformedWriteStatuses); } @Override protected String getCommitActionType() { return HoodieTimeline.REPLACE_COMMIT_ACTION; } - - @Override - protected Map> getPartitionToReplacedFileIds(HoodieWriteMetadata> writeMetadata) { - Set newFilesWritten = writeMetadata.getWriteStats().get().stream() - .map(s -> new HoodieFileGroupId(s.getPartitionPath(), s.getFileId())).collect(Collectors.toSet()); - return ClusteringUtils.getFileGroupsFromClusteringPlan(clusteringPlan) - .filter(fg -> !newFilesWritten.contains(fg)) - .collect(Collectors.groupingBy(fg -> fg.getPartitionPath(), Collectors.mapping(fg -> fg.getFileId(), Collectors.toList()))); - } } diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/BaseJavaCommitActionExecutor.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/BaseJavaCommitActionExecutor.java index 2a93c5012ce1e..dc6994d315f02 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/BaseJavaCommitActionExecutor.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/BaseJavaCommitActionExecutor.java @@ -42,6 +42,7 @@ import org.apache.hudi.io.CreateHandleFactory; import org.apache.hudi.io.HoodieMergeHandle; import org.apache.hudi.io.HoodieSortedMergeHandle; +import org.apache.hudi.io.HoodieConcatHandle; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.WorkloadProfile; import org.apache.hudi.table.WorkloadStat; @@ -90,27 +91,27 @@ public BaseJavaCommitActionExecutor(HoodieEngineContext context, public HoodieWriteMetadata> execute(List> 
inputRecords) { HoodieWriteMetadata> result = new HoodieWriteMetadata<>(); - WorkloadProfile profile = null; + WorkloadProfile workloadProfile = null; if (isWorkloadProfileNeeded()) { - profile = new WorkloadProfile(buildProfile(inputRecords)); - LOG.info("Workload profile :" + profile); + workloadProfile = new WorkloadProfile(buildProfile(inputRecords), table.getIndex().canIndexLogFiles()); + LOG.info("Input workload profile :" + workloadProfile); + } + + final Partitioner partitioner = getPartitioner(workloadProfile); + try { + saveWorkloadProfileMetadataToInflight(workloadProfile, instantTime); + } catch (Exception e) { + HoodieTableMetaClient metaClient = table.getMetaClient(); + HoodieInstant inflightInstant = new HoodieInstant(HoodieInstant.State.INFLIGHT, metaClient.getCommitActionType(), instantTime); try { - saveWorkloadProfileMetadataToInflight(profile, instantTime); - } catch (Exception e) { - HoodieTableMetaClient metaClient = table.getMetaClient(); - HoodieInstant inflightInstant = new HoodieInstant(HoodieInstant.State.INFLIGHT, metaClient.getCommitActionType(), instantTime); - try { - if (!metaClient.getFs().exists(new Path(metaClient.getMetaPath(), inflightInstant.getFileName()))) { - throw new HoodieCommitException("Failed to commit " + instantTime + " unable to save inflight metadata ", e); - } - } catch (IOException ex) { - LOG.error("Check file exists failed"); - throw new HoodieCommitException("Failed to commit " + instantTime + " unable to save inflight metadata ", ex); + if (!metaClient.getFs().exists(new Path(metaClient.getMetaPath(), inflightInstant.getFileName()))) { + throw new HoodieCommitException("Failed to commit " + instantTime + " unable to save inflight metadata ", e); } + } catch (IOException ex) { + LOG.error("Check file exists failed"); + throw new HoodieCommitException("Failed to commit " + instantTime + " unable to save inflight metadata ", ex); } } - - final Partitioner partitioner = getPartitioner(profile); Map>> partitionedRecords = partition(inputRecords, partitioner); List writeStatuses = new LinkedList<>(); @@ -196,6 +197,11 @@ protected void commit(Option> extraMetadata, HoodieWriteMeta commit(extraMetadata, result, result.getWriteStatuses().stream().map(WriteStatus::getStat).collect(Collectors.toList())); } + protected void setCommitMetadata(HoodieWriteMetadata> result) { + result.setCommitMetadata(Option.of(CommitUtils.buildMetadata(result.getWriteStatuses().stream().map(WriteStatus::getStat).collect(Collectors.toList()), + result.getPartitionToReplaceFileIds(), extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType()))); + } + protected void commit(Option> extraMetadata, HoodieWriteMetadata> result, List writeStats) { String actionType = getCommitActionType(); LOG.info("Committing " + instantTime + ", action Type " + actionType); @@ -206,8 +212,7 @@ protected void commit(Option> extraMetadata, HoodieWriteMeta try { LOG.info("Committing " + instantTime + ", action Type " + getCommitActionType()); HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); - HoodieCommitMetadata metadata = CommitUtils.buildMetadata(writeStats, result.getPartitionToReplaceFileIds(), - extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType()); + HoodieCommitMetadata metadata = result.getCommitMetadata().get(); writeTableMetadata(metadata, actionType); @@ -289,6 +294,8 @@ protected Iterator> handleUpdateInternal(HoodieMergeHandle> recordItr) { if (table.requireSortedRecords()) { return new 
HoodieSortedMergeHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier, Option.empty()); + } else if (!WriteOperationType.isChangingRecords(operationType) && config.allowDuplicateInserts()) { + return new HoodieConcatHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier, Option.empty()); } else { return new HoodieMergeHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier, Option.empty()); } diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaBulkInsertHelper.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaBulkInsertHelper.java index cdfa303cd738e..de7afdf00ebeb 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaBulkInsertHelper.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaBulkInsertHelper.java @@ -39,12 +39,12 @@ import java.util.List; /** - * A java implementation of {@link AbstractBulkInsertHelper}. + * A java implementation of {@link BaseBulkInsertHelper}. * * @param */ @SuppressWarnings("checkstyle:LineLength") -public class JavaBulkInsertHelper extends AbstractBulkInsertHelper>, +public class JavaBulkInsertHelper extends BaseBulkInsertHelper>, List, List, R> { private JavaBulkInsertHelper() { @@ -65,7 +65,7 @@ public HoodieWriteMetadata> bulkInsert(final List>, List, List, R> executor, final boolean performDedupe, - final Option> userDefinedBulkInsertPartitioner) { + final Option>>> userDefinedBulkInsertPartitioner) { HoodieWriteMetadata result = new HoodieWriteMetadata(); // It's possible the transition to inflight could have already happened. @@ -89,7 +89,7 @@ public List bulkInsert(List> inputRecords, HoodieTable>, List, List> table, HoodieWriteConfig config, boolean performDedupe, - Option> userDefinedBulkInsertPartitioner, + Option>>> userDefinedBulkInsertPartitioner, boolean useWriterSchema, int parallelism, WriteHandleFactory writeHandleFactory) { diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaBulkInsertPreppedCommitActionExecutor.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaBulkInsertPreppedCommitActionExecutor.java index 37b56b6325bc3..ed72fbe7850e2 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaBulkInsertPreppedCommitActionExecutor.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaBulkInsertPreppedCommitActionExecutor.java @@ -26,9 +26,8 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieInsertException; -import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.BulkInsertPartitioner; - +import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; import java.util.List; @@ -37,12 +36,12 @@ public class JavaBulkInsertPreppedCommitActionExecutor { private final List> preppedInputRecord; - private final Option> userDefinedBulkInsertPartitioner; + private final Option>>> userDefinedBulkInsertPartitioner; public JavaBulkInsertPreppedCommitActionExecutor(HoodieJavaEngineContext context, HoodieWriteConfig config, HoodieTable table, String instantTime, List> preppedInputRecord, - Option> userDefinedBulkInsertPartitioner) { + Option>>> userDefinedBulkInsertPartitioner) { 
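    // Prepped records have already been tagged with their index locations upstream,
    // so this executor hands them straight to the bulk-insert helper with
    // deduplication disabled, skipping the tag/dedupe pass of the non-prepped path.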
super(context, config, table, instantTime, WriteOperationType.BULK_INSERT); this.preppedInputRecord = preppedInputRecord; this.userDefinedBulkInsertPartitioner = userDefinedBulkInsertPartitioner; @@ -60,4 +59,4 @@ public HoodieWriteMetadata> execute() { throw new HoodieInsertException("Failed to bulk insert for commit time " + instantTime, e); } } -} \ No newline at end of file +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaDeleteHelper.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaDeleteHelper.java index fc81b787f4737..f82c1c561b2c5 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaDeleteHelper.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaDeleteHelper.java @@ -22,6 +22,7 @@ import org.apache.hudi.common.data.HoodieList; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.EmptyHoodieRecordPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.util.collection.Pair; @@ -43,7 +44,7 @@ @SuppressWarnings("checkstyle:LineLength") public class JavaDeleteHelper extends - AbstractDeleteHelper>, List, List, R> { + BaseDeleteHelper>, List, List, R> { private JavaDeleteHelper() { } @@ -95,7 +96,7 @@ public HoodieWriteMetadata> execute(String instantTime, } List> dedupedRecords = - dedupedKeys.stream().map(key -> new HoodieRecord<>(key, new EmptyHoodieRecordPayload())).collect(Collectors.toList()); + dedupedKeys.stream().map(key -> new HoodieAvroRecord<>(key, new EmptyHoodieRecordPayload())).collect(Collectors.toList()); Instant beginTag = Instant.now(); // perform index look up to get existing location of records List> taggedRecords = HoodieList.getList( diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaMergeHelper.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaMergeHelper.java index a55121472310d..7878d857761ea 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaMergeHelper.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaMergeHelper.java @@ -44,7 +44,7 @@ import java.util.Iterator; import java.util.List; -public class JavaMergeHelper extends AbstractMergeHelper>, +public class JavaMergeHelper extends BaseMergeHelper>, List, List> { private JavaMergeHelper() { @@ -91,7 +91,7 @@ public void runMerge(HoodieTable>, List, List ThreadLocal encoderCache = new ThreadLocal<>(); ThreadLocal decoderCache = new ThreadLocal<>(); - wrapper = new BoundedInMemoryExecutor(table.getConfig().getWriteBufferLimitBytes(), new IteratorBasedQueueProducer<>(readerIterator), + wrapper = new BoundedInMemoryExecutor<>(table.getConfig().getWriteBufferLimitBytes(), new IteratorBasedQueueProducer<>(readerIterator), Option.of(new UpdateHandler(mergeHandle)), record -> { if (!externalSchemaTransformation) { return record; diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaUpsertPartitioner.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaUpsertPartitioner.java index 33f59f4406f39..deaf934cf5d03 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaUpsertPartitioner.java 
+++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaUpsertPartitioner.java @@ -25,6 +25,7 @@ import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.NumericUtils; @@ -64,9 +65,9 @@ public class JavaUpsertPartitioner> implements */ private int totalBuckets = 0; /** - * Stat for the current workload. Helps in determining inserts, upserts etc. + * Stat for the input and output workload. Describe the workload before and after being assigned buckets. */ - private WorkloadProfile profile; + private WorkloadProfile workloadProfile; /** * Helps decide which bucket an incoming update should go to. */ @@ -84,16 +85,16 @@ public class JavaUpsertPartitioner> implements protected final HoodieWriteConfig config; - public JavaUpsertPartitioner(WorkloadProfile profile, HoodieEngineContext context, HoodieTable table, + public JavaUpsertPartitioner(WorkloadProfile workloadProfile, HoodieEngineContext context, HoodieTable table, HoodieWriteConfig config) { updateLocationToBucket = new HashMap<>(); partitionPathToInsertBucketInfos = new HashMap<>(); bucketInfoMap = new HashMap<>(); - this.profile = profile; + this.workloadProfile = workloadProfile; this.table = table; this.config = config; - assignUpdates(profile); - assignInserts(profile, context); + assignUpdates(workloadProfile); + assignInserts(workloadProfile, context); LOG.info("Total Buckets :" + totalBuckets + ", buckets info => " + bucketInfoMap + ", \n" + "Partition to insert buckets => " + partitionPathToInsertBucketInfos + ", \n" @@ -102,11 +103,19 @@ public JavaUpsertPartitioner(WorkloadProfile profile, HoodieEngineContext contex private void assignUpdates(WorkloadProfile profile) { // each update location gets a partition - Set> partitionStatEntries = profile.getPartitionPathStatMap().entrySet(); + Set> partitionStatEntries = profile.getInputPartitionPathStatMap().entrySet(); for (Map.Entry partitionStat : partitionStatEntries) { + WorkloadStat outputWorkloadStats = profile.getOutputPartitionPathStatMap().getOrDefault(partitionStat.getKey(), new WorkloadStat()); for (Map.Entry> updateLocEntry : partitionStat.getValue().getUpdateLocationToCount().entrySet()) { addUpdateBucket(partitionStat.getKey(), updateLocEntry.getKey()); + if (profile.hasOutputWorkLoadStats()) { + HoodieRecordLocation hoodieRecordLocation = new HoodieRecordLocation(updateLocEntry.getValue().getKey(), updateLocEntry.getKey()); + outputWorkloadStats.addUpdates(hoodieRecordLocation, updateLocEntry.getValue().getValue()); + } + } + if (profile.hasOutputWorkLoadStats()) { + profile.updateOutputPartitionPathStatMap(partitionStat.getKey(), outputWorkloadStats); } } } @@ -133,9 +142,10 @@ private void assignInserts(WorkloadProfile profile, HoodieEngineContext context) for (String partitionPath : partitionPaths) { WorkloadStat pStat = profile.getWorkloadStat(partitionPath); + WorkloadStat outputWorkloadStats = profile.getOutputPartitionPathStatMap().getOrDefault(partitionPath, new WorkloadStat()); if (pStat.getNumInserts() > 0) { - List smallFiles = partitionSmallFilesMap.get(partitionPath); + List smallFiles = partitionSmallFilesMap.getOrDefault(partitionPath, new ArrayList<>()); this.smallFiles.addAll(smallFiles); LOG.info("For 
partitionPath : " + partitionPath + " Small Files => " + smallFiles); @@ -158,6 +168,9 @@ private void assignInserts(WorkloadProfile profile, HoodieEngineContext context) bucket = addUpdateBucket(partitionPath, smallFile.location.getFileId()); LOG.info("Assigning " + recordsToAppend + " inserts to new update bucket " + bucket); } + if (profile.hasOutputWorkLoadStats()) { + outputWorkloadStats.addInserts(smallFile.location, recordsToAppend); + } bucketNumbers.add(bucket); recordsPerBucket.add(recordsToAppend); totalUnassignedInserts -= recordsToAppend; @@ -183,6 +196,9 @@ private void assignInserts(WorkloadProfile profile, HoodieEngineContext context) } BucketInfo bucketInfo = new BucketInfo(BucketType.INSERT, FSUtils.createNewFileIdPfx(), partitionPath); bucketInfoMap.put(totalBuckets, bucketInfo); + if (profile.hasOutputWorkLoadStats()) { + outputWorkloadStats.addInserts(new HoodieRecordLocation(HoodieWriteStat.NULL_COMMIT, bucketInfo.getFileIdPrefix()), recordsPerBucket.get(recordsPerBucket.size() - 1)); + } totalBuckets++; } } @@ -200,11 +216,19 @@ private void assignInserts(WorkloadProfile profile, HoodieEngineContext context) LOG.info("Total insert buckets for partition path " + partitionPath + " => " + insertBuckets); partitionPathToInsertBucketInfos.put(partitionPath, insertBuckets); } + if (profile.hasOutputWorkLoadStats()) { + profile.updateOutputPartitionPathStatMap(partitionPath, outputWorkloadStats); + } } } private Map> getSmallFilesForPartitions(List partitionPaths, HoodieEngineContext context) { Map> partitionSmallFilesMap = new HashMap<>(); + + if (config.getParquetSmallFileLimit() <= 0) { + return partitionSmallFilesMap; + } + if (partitionPaths != null && partitionPaths.size() > 0) { context.setJobStatus(this.getClass().getSimpleName(), "Getting small files from partitions"); partitionSmallFilesMap = context.mapToPair(partitionPaths, @@ -266,7 +290,7 @@ public int getPartition(Object key) { String partitionPath = keyLocation.getLeft().getPartitionPath(); List targetBuckets = partitionPathToInsertBucketInfos.get(partitionPath); // pick the target bucket to use based on the weights. 
- final long totalInserts = Math.max(1, profile.getWorkloadStat(partitionPath).getNumInserts()); + final long totalInserts = Math.max(1, workloadProfile.getWorkloadStat(partitionPath).getNumInserts()); final long hashOfKey = NumericUtils.getMessageDigestHash("MD5", keyLocation.getLeft().getRecordKey()); final double r = 1.0 * Math.floorMod(hashOfKey, totalInserts) / totalInserts; diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaWriteHelper.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaWriteHelper.java index 8af7707ea2f98..3a1fa4b884fd0 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaWriteHelper.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaWriteHelper.java @@ -21,6 +21,7 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.data.HoodieList; import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; @@ -33,7 +34,7 @@ import java.util.Objects; import java.util.stream.Collectors; -public class JavaWriteHelper extends AbstractWriteHelper>, +public class JavaWriteHelper extends BaseWriteHelper>, List, List, R> { private JavaWriteHelper() { @@ -55,7 +56,7 @@ protected List> tag(List> dedupedRecords, Hoodie @Override public List> deduplicateRecords( - List> records, HoodieIndex index, int parallelism) { + List> records, HoodieIndex index, int parallelism) { boolean isIndexingGlobal = index.isGlobal(); Map>>> keyedRecords = records.stream().map(record -> { HoodieKey hoodieKey = record.getKey(); @@ -70,7 +71,7 @@ public List> deduplicateRecords( // we cannot allow the user to change the key or partitionPath, since that will affect // everything // so pick it from one of the records. - return new HoodieRecord(rec1.getKey(), reducedData); + return new HoodieAvroRecord(rec1.getKey(), reducedData); }).orElse(null)).filter(Objects::nonNull).collect(Collectors.toList()); } } diff --git a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestHoodieConcatHandle.java b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestHoodieConcatHandle.java new file mode 100644 index 0000000000000..d81b76b0f4577 --- /dev/null +++ b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestHoodieConcatHandle.java @@ -0,0 +1,228 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.action.commit; + +import org.apache.hudi.client.HoodieJavaWriteClient; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.engine.EngineType; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.testutils.RawTripTestPayload; +import org.apache.hudi.common.util.BaseFileUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.hadoop.HoodieParquetInputFormat; +import org.apache.hudi.hadoop.utils.HoodieHiveUtils; +import org.apache.hudi.testutils.HoodieJavaClientTestBase; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.JobConf; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; +import static org.apache.hudi.common.testutils.HoodieTestTable.makeNewCommitTime; +import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestHoodieConcatHandle extends HoodieJavaClientTestBase { + private static final Schema SCHEMA = getSchemaFromResource(TestJavaCopyOnWriteActionExecutor.class, "/exampleSchema.avsc"); + + private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() { + return makeHoodieClientConfigBuilder(SCHEMA.toString()); + } + + private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder(String schema) { + // Prepare the AvroParquetIO + return HoodieWriteConfig.newBuilder() + .withEngineType(EngineType.JAVA) + .withPath(basePath) + .withSchema(schema); + } + + private FileStatus[] getIncrementalFiles(String partitionPath, String startCommitTime, int numCommitsToPull) + throws Exception { + // initialize parquet input format + HoodieParquetInputFormat hoodieInputFormat = new HoodieParquetInputFormat(); + JobConf jobConf = new JobConf(hadoopConf); + hoodieInputFormat.setConf(jobConf); + HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE); + setupIncremental(jobConf, startCommitTime, numCommitsToPull); + FileInputFormat.setInputPaths(jobConf, Paths.get(basePath, partitionPath).toString()); + return hoodieInputFormat.listStatus(jobConf); + } + + private void setupIncremental(JobConf jobConf, String startCommit, int numberOfCommitsToPull) { + String modePropertyName = + String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); + jobConf.set(modePropertyName, HoodieHiveUtils.INCREMENTAL_SCAN_MODE); + + String startCommitTimestampName = + String.format(HoodieHiveUtils.HOODIE_START_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); + jobConf.set(startCommitTimestampName, startCommit); + + String maxCommitPulls = + 
String.format(HoodieHiveUtils.HOODIE_MAX_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME); + jobConf.setInt(maxCommitPulls, numberOfCommitsToPull); + } + + @Test + public void testInsert() throws Exception { + HoodieWriteConfig config = makeHoodieClientConfigBuilder().withMergeAllowDuplicateOnInserts(true).build(); + + HoodieJavaWriteClient writeClient = getHoodieWriteClient(config); + metaClient = HoodieTableMetaClient.reload(metaClient); + BaseFileUtils fileUtils = BaseFileUtils.getInstance(metaClient); + + // Get some records belonging to the same partition (2021/09/11) + String insertRecordStr1 = "{\"_row_key\":\"1\"," + + "\"time\":\"2021-09-11T16:16:41.415Z\",\"number\":1}"; + String insertRecordStr2 = "{\"_row_key\":\"2\"," + + "\"time\":\"2021-09-11T16:16:41.415Z\",\"number\":2}"; + List records1 = new ArrayList<>(); + RawTripTestPayload insertRow1 = new RawTripTestPayload(insertRecordStr1); + RawTripTestPayload insertRow2 = new RawTripTestPayload(insertRecordStr2); + records1.add(new HoodieAvroRecord(new HoodieKey(insertRow1.getRowKey(), insertRow1.getPartitionPath()), insertRow1)); + records1.add(new HoodieAvroRecord(new HoodieKey(insertRow2.getRowKey(), insertRow2.getPartitionPath()), insertRow2)); + + int startInstant = 1; + String firstCommitTime = makeNewCommitTime(startInstant++); + // First insert + writeClient.startCommitWithTime(firstCommitTime); + writeClient.insert(records1, firstCommitTime); + + String partitionPath = "2021/09/11"; + FileStatus[] allFiles = getIncrementalFiles(partitionPath, "0", -1); + assertEquals(1, allFiles.length); + + // Read out the bloom filter and make sure it can answer whether a record exists + Path filePath = allFiles[0].getPath(); + BloomFilter filter = fileUtils.readBloomFilterFromMetadata(hadoopConf, filePath); + for (HoodieRecord record : records1) { + assertTrue(filter.mightContain(record.getRecordKey())); + } + + insertRecordStr1 = "{\"_row_key\":\"1\"," + + "\"time\":\"2021-09-11T16:39:41.415Z\",\"number\":3}"; + insertRecordStr2 = "{\"_row_key\":\"2\"," + + "\"time\":\"2021-09-11T16:39:41.415Z\",\"number\":4}"; + + List records2 = new ArrayList<>(); + insertRow1 = new RawTripTestPayload(insertRecordStr1); + insertRow2 = new RawTripTestPayload(insertRecordStr2); + // The record keys of records2 and records1 are the same, but the values of other fields differ + records2.add(new HoodieAvroRecord(new HoodieKey(insertRow1.getRowKey(), insertRow1.getPartitionPath()), insertRow1)); + records2.add(new HoodieAvroRecord(new HoodieKey(insertRow2.getRowKey(), insertRow2.getPartitionPath()), insertRow2)); + + String newCommitTime = makeNewCommitTime(startInstant++); + writeClient.startCommitWithTime(newCommitTime); + // The second insert has the same _row_key as the first one; tests allowDuplicateInserts + writeClient.insert(records2, newCommitTime); + + allFiles = getIncrementalFiles(partitionPath, firstCommitTime, -1); + assertEquals(1, allFiles.length); + // verify the new incremental file group is the same as the previous one + assertEquals(FSUtils.getFileId(filePath.getName()), FSUtils.getFileId(allFiles[0].getPath().getName())); + + filePath = allFiles[0].getPath(); + // The final result should be a collection of records1 and records2 + records1.addAll(records2); + + // Read the base file, check the record content + List fileRecords = fileUtils.readAvroRecords(hadoopConf, filePath); + int index = 0; + for (GenericRecord record : fileRecords) { + assertEquals(records1.get(index).getRecordKey(), record.get("_row_key").toString()); + assertEquals(index + 1, record.get("number")); + index++; + } + } + + @ParameterizedTest + @ValueSource(booleans = {false, true}) + public void testInsertWithDataGenerator(boolean mergeAllowDuplicateOnInsertsEnable) throws Exception { + HoodieWriteConfig config = makeHoodieClientConfigBuilder(TRIP_EXAMPLE_SCHEMA) + .withMergeAllowDuplicateOnInserts(mergeAllowDuplicateOnInsertsEnable).build(); + + HoodieJavaWriteClient writeClient = getHoodieWriteClient(config); + metaClient = HoodieTableMetaClient.reload(metaClient); + BaseFileUtils fileUtils = BaseFileUtils.getInstance(metaClient); + + String partitionPath = "2021/09/11"; + HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[]{partitionPath}); + + int startInstant = 1; + String firstCommitTime = makeNewCommitTime(startInstant++); + List records1 = dataGenerator.generateInserts(firstCommitTime, 100); + + // First insert + writeClient.startCommitWithTime(firstCommitTime); + writeClient.insert(records1, firstCommitTime); + + FileStatus[] allFiles = getIncrementalFiles(partitionPath, "0", -1); + assertEquals(1, allFiles.length); + + // Read out the bloom filter and make sure it can answer whether a record exists + Path filePath = allFiles[0].getPath(); + BloomFilter filter = fileUtils.readBloomFilterFromMetadata(hadoopConf, filePath); + for (HoodieRecord record : records1) { + assertTrue(filter.mightContain(record.getRecordKey())); + } + + String newCommitTime = makeNewCommitTime(startInstant++); + List records2 = dataGenerator.generateUpdates(newCommitTime, 100); + writeClient.startCommitWithTime(newCommitTime); + // The second insert has the same _row_key as the first one; tests allowDuplicateInserts + writeClient.insert(records2, newCommitTime); + + allFiles = getIncrementalFiles(partitionPath, firstCommitTime, -1); + assertEquals(1, allFiles.length); + // verify the new incremental file group is the same as the previous one + assertEquals(FSUtils.getFileId(filePath.getName()), FSUtils.getFileId(allFiles[0].getPath().getName())); + + filePath = allFiles[0].getPath(); + // If mergeAllowDuplicateOnInsertsEnable is true, the final result should be a collection of records1 and records2 + records1.addAll(records2); + + // Read the base file, check the record content + List fileRecords = fileUtils.readAvroRecords(hadoopConf, filePath); + assertEquals(fileRecords.size(), mergeAllowDuplicateOnInsertsEnable ?
records1.size() : records2.size()); + + int index = 0; + for (GenericRecord record : fileRecords) { + assertEquals(records1.get(index).getRecordKey(), record.get("_row_key").toString()); + index++; + } + } +} diff --git a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestJavaCopyOnWriteActionExecutor.java b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestJavaCopyOnWriteActionExecutor.java index 796d7b74a83c5..793b26703011e 100644 --- a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestJavaCopyOnWriteActionExecutor.java +++ b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/table/action/commit/TestJavaCopyOnWriteActionExecutor.java @@ -23,6 +23,7 @@ import org.apache.hudi.common.bloom.BloomFilter; import org.apache.hudi.common.engine.EngineType; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; @@ -42,6 +43,8 @@ import org.apache.hudi.table.HoodieJavaCopyOnWriteTable; import org.apache.hudi.table.HoodieJavaTable; import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.testutils.HoodieJavaClientTestBase; +import org.apache.hudi.testutils.MetadataMergeWriteStatus; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; @@ -49,8 +52,6 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.JobConf; -import org.apache.hudi.testutils.HoodieJavaClientTestBase; -import org.apache.hudi.testutils.MetadataMergeWriteStatus; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.parquet.avro.AvroReadSupport; @@ -121,14 +122,14 @@ private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() { public void testUpdateRecords() throws Exception { // Prepare the AvroParquetIO HoodieWriteConfig config = makeHoodieClientConfig(); - String firstCommitTime = makeNewCommitTime(); + int startInstant = 1; + String firstCommitTime = makeNewCommitTime(startInstant++); HoodieJavaWriteClient writeClient = getHoodieWriteClient(config); writeClient.startCommitWithTime(firstCommitTime); metaClient = HoodieTableMetaClient.reload(metaClient); BaseFileUtils fileUtils = BaseFileUtils.getInstance(metaClient); String partitionPath = "2016/01/31"; - HoodieJavaCopyOnWriteTable table = (HoodieJavaCopyOnWriteTable) HoodieJavaTable.create(config, context, metaClient); // Get some records belong to the same partition (2016/01/31) String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," @@ -142,14 +143,13 @@ public void testUpdateRecords() throws Exception { List records = new ArrayList<>(); RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); - records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1)); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1)); RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); - records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2)); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2)); RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); - records.add(new HoodieRecord(new 
HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3)); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3)); // Insert new records - final HoodieJavaCopyOnWriteTable cowTable = table; writeClient.insert(records, firstCommitTime); FileStatus[] allFiles = getIncrementalFiles(partitionPath, "0", -1); @@ -176,17 +176,16 @@ public void testUpdateRecords() throws Exception { String updateRecordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; RawTripTestPayload updateRowChanges1 = new RawTripTestPayload(updateRecordStr1); - HoodieRecord updatedRecord1 = new HoodieRecord( + HoodieRecord updatedRecord1 = new HoodieAvroRecord( new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()), updateRowChanges1); RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4); HoodieRecord insertedRecord1 = - new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); List updatedRecords = Arrays.asList(updatedRecord1, insertedRecord1); - Thread.sleep(1000); - String newCommitTime = makeNewCommitTime(); + String newCommitTime = makeNewCommitTime(startInstant++); metaClient = HoodieTableMetaClient.reload(metaClient); writeClient.startCommitWithTime(newCommitTime); List statuses = writeClient.upsert(updatedRecords, newCommitTime); @@ -197,9 +196,9 @@ public void testUpdateRecords() throws Exception { assertEquals(FSUtils.getFileId(filePath.getName()), FSUtils.getFileId(allFiles[0].getPath().getName())); // Check whether the record has been updated - Path updatedfilePath = allFiles[0].getPath(); + Path updatedFilePath = allFiles[0].getPath(); BloomFilter updatedFilter = - fileUtils.readBloomFilterFromMetadata(hadoopConf, updatedfilePath); + fileUtils.readBloomFilterFromMetadata(hadoopConf, updatedFilePath); for (HoodieRecord record : records) { // No change to the _row_key assertTrue(updatedFilter.mightContain(record.getRecordKey())); @@ -208,7 +207,7 @@ public void testUpdateRecords() throws Exception { assertTrue(updatedFilter.mightContain(insertedRecord1.getRecordKey())); records.add(insertedRecord1);// add this so it can further check below - ParquetReader updatedReader = ParquetReader.builder(new AvroReadSupport<>(), updatedfilePath).build(); + ParquetReader updatedReader = ParquetReader.builder(new AvroReadSupport<>(), updatedFilePath).build(); index = 0; while ((newRecord = (GenericRecord) updatedReader.read()) != null) { assertEquals(newRecord.get("_row_key").toString(), records.get(index).getRecordKey()); @@ -256,7 +255,7 @@ private List newHoodieRecords(int n, String time) throws Exception String recordStr = String.format("{\"_row_key\":\"%s\",\"time\":\"%s\",\"number\":%d}", UUID.randomUUID().toString(), time, i); RawTripTestPayload rowChange = new RawTripTestPayload(recordStr); - records.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange)); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange)); } return records; } @@ -282,11 +281,11 @@ public void testMetadataAggregateFromWriteStatus() throws Exception { List records = new ArrayList<>(); RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); - records.add(new HoodieRecord(new 
HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1)); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1)); RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); - records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2)); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2)); RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); - records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3)); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3)); // Insert new records BaseJavaCommitActionExecutor actionExecutor = new JavaInsertCommitActionExecutor(context, config, table, @@ -384,7 +383,7 @@ public void testFileSizeUpsertRecords() throws Exception { String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString() + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}"; RawTripTestPayload rowChange = new RawTripTestPayload(recordStr); - records.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange)); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange)); } // Insert new records diff --git a/hudi-client/hudi-spark-client/pom.xml b/hudi-client/hudi-spark-client/pom.xml index e4a8fd56b6a65..d6c60cb61bc45 100644 --- a/hudi-client/hudi-spark-client/pom.xml +++ b/hudi-client/hudi-spark-client/pom.xml @@ -169,9 +169,9 @@ - org.awaitility - awaitility - test + org.awaitility + awaitility + test diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/async/SparkAsyncClusteringService.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/async/SparkAsyncClusteringService.java index ce436ba034d98..8f6535b11d9b3 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/async/SparkAsyncClusteringService.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/async/SparkAsyncClusteringService.java @@ -19,8 +19,8 @@ package org.apache.hudi.async; -import org.apache.hudi.client.AbstractClusteringClient; -import org.apache.hudi.client.AbstractHoodieWriteClient; +import org.apache.hudi.client.BaseClusterer; +import org.apache.hudi.client.BaseHoodieWriteClient; import org.apache.hudi.client.HoodieSparkClusteringClient; /** @@ -28,12 +28,12 @@ */ public class SparkAsyncClusteringService extends AsyncClusteringService { - public SparkAsyncClusteringService(AbstractHoodieWriteClient writeClient) { + public SparkAsyncClusteringService(BaseHoodieWriteClient writeClient) { super(writeClient); } @Override - protected AbstractClusteringClient createClusteringClient(AbstractHoodieWriteClient client) { + protected BaseClusterer createClusteringClient(BaseHoodieWriteClient client) { return new HoodieSparkClusteringClient(client); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/async/SparkAsyncCompactService.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/async/SparkAsyncCompactService.java index 5235a3cd0a15d..d54fe386bd06b 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/async/SparkAsyncCompactService.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/async/SparkAsyncCompactService.java @@ -18,19 +18,19 @@ 
package org.apache.hudi.async; -import org.apache.hudi.client.AbstractCompactor; -import org.apache.hudi.client.AbstractHoodieWriteClient; +import org.apache.hudi.client.BaseCompactor; +import org.apache.hudi.client.BaseHoodieWriteClient; import org.apache.hudi.client.HoodieSparkCompactor; import org.apache.hudi.common.engine.HoodieEngineContext; public class SparkAsyncCompactService extends AsyncCompactService { - public SparkAsyncCompactService(HoodieEngineContext context, AbstractHoodieWriteClient client) { + public SparkAsyncCompactService(HoodieEngineContext context, BaseHoodieWriteClient client) { super(context, client); } @Override - protected AbstractCompactor createCompactor(AbstractHoodieWriteClient client) { + protected BaseCompactor createCompactor(BaseHoodieWriteClient client) { return new HoodieSparkCompactor(client, this.context); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieReadClient.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieReadClient.java index 84040f906ce32..e9bdc427e8356 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieReadClient.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieReadClient.java @@ -18,9 +18,9 @@ package org.apache.hudi.client; -import org.apache.hadoop.conf.Configuration; import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieKey; @@ -39,6 +39,7 @@ import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; +import org.apache.hadoop.conf.Configuration; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; @@ -64,9 +65,9 @@ public class HoodieReadClient> implements Seria /** * TODO: We need to persist the index type into hoodie.properties and be able to access the index just with a simple - * basepath pointing to the table. Until, then just always assume a BloomIndex + * base path pointing to the table. Until then, just always assume a BloomIndex */ - private final transient HoodieIndex index; + private final transient HoodieIndex index; private HoodieTable>, JavaRDD, JavaRDD> hoodieTable; private transient Option sqlContextOpt; private final transient HoodieSparkEngineContext context; @@ -172,7 +173,7 @@ public Dataset readROView(JavaRDD hoodieKeys, int parallelism) { */ public JavaPairRDD>> checkExists(JavaRDD hoodieKeys) { return HoodieJavaRDD.getJavaRDD( - index.tagLocation(HoodieJavaRDD.of(hoodieKeys.map(k -> new HoodieRecord<>(k, null))), + index.tagLocation(HoodieJavaRDD.of(hoodieKeys.map(k -> new HoodieAvroRecord<>(k, null))), context, hoodieTable)) .mapToPair(hr -> new Tuple2<>(hr.getKey(), hr.isCurrentLocationKnown() ?
Option.of(Pair.of(hr.getPartitionPath(), hr.getCurrentLocation().getFileId())) diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieSparkClusteringClient.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieSparkClusteringClient.java index 16e54a21551c6..0812b366aadac 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieSparkClusteringClient.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieSparkClusteringClient.java @@ -38,12 +38,12 @@ * Async clustering client for Spark datasource. */ public class HoodieSparkClusteringClient extends - AbstractClusteringClient>, JavaRDD, JavaRDD> { + BaseClusterer>, JavaRDD, JavaRDD> { private static final Logger LOG = LogManager.getLogger(HoodieSparkClusteringClient.class); public HoodieSparkClusteringClient( - AbstractHoodieWriteClient>, JavaRDD, JavaRDD> clusteringClient) { + BaseHoodieWriteClient>, JavaRDD, JavaRDD> clusteringClient) { super(clusteringClient); } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieSparkCompactor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieSparkCompactor.java index 60a064ab32bf9..b3dc27b6fc65b 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieSparkCompactor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/HoodieSparkCompactor.java @@ -22,33 +22,36 @@ import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.Option; import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.table.action.HoodieWriteMetadata; + import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; -import java.io.IOException; +import java.util.List; -public class HoodieSparkCompactor extends AbstractCompactor extends BaseCompactor>, JavaRDD, JavaRDD> { private static final Logger LOG = LogManager.getLogger(HoodieSparkCompactor.class); private transient HoodieEngineContext context; - public HoodieSparkCompactor(AbstractHoodieWriteClient>, JavaRDD, JavaRDD> compactionClient, + public HoodieSparkCompactor(BaseHoodieWriteClient>, JavaRDD, JavaRDD> compactionClient, HoodieEngineContext context) { super(compactionClient); this.context = context; } @Override - public void compact(HoodieInstant instant) throws IOException { + public void compact(HoodieInstant instant) { LOG.info("Compactor executing compaction " + instant); SparkRDDWriteClient writeClient = (SparkRDDWriteClient) compactionClient; - JavaRDD res = writeClient.compact(instant.getTimestamp()); - this.context.setJobStatus(this.getClass().getSimpleName(), "Collect compaction write status"); - long numWriteErrors = res.collect().stream().filter(WriteStatus::hasErrors).count(); + HoodieWriteMetadata> compactionMetadata = writeClient.compact(instant.getTimestamp()); + List writeStats = compactionMetadata.getCommitMetadata().get().getWriteStats(); + long numWriteErrors = writeStats.stream().mapToLong(HoodieWriteStat::getTotalWriteErrors).sum(); if (numWriteErrors != 0) { // We treat even a single error in compaction as fatal LOG.error("Compaction for instant (" + instant + ") failed with write errors. 
Errors :" + numWriteErrors); @@ -56,6 +59,6 @@ public void compact(HoodieInstant instant) throws IOException { "Compaction for instant (" + instant + ") failed with write errors. Errors :" + numWriteErrors); } // Commit compaction - writeClient.commitCompaction(instant.getTimestamp(), res, Option.empty()); + writeClient.commitCompaction(instant.getTimestamp(), compactionMetadata.getCommitMetadata().get(), Option.empty()); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java index 9b2aad3ebafa1..d51d25616c70d 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java @@ -65,7 +65,6 @@ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import java.io.IOException; import java.nio.charset.StandardCharsets; import java.text.ParseException; import java.util.List; @@ -74,7 +73,7 @@ @SuppressWarnings("checkstyle:LineLength") public class SparkRDDWriteClient extends - AbstractHoodieWriteClient>, JavaRDD, JavaRDD> { + BaseHoodieWriteClient>, JavaRDD, JavaRDD> { private static final Logger LOG = LogManager.getLogger(SparkRDDWriteClient.class); @@ -286,26 +285,24 @@ protected JavaRDD postWrite(HoodieWriteMetadata writeStatuses, Option> extraMetadata) throws IOException { + public void commitCompaction(String compactionInstantTime, HoodieCommitMetadata metadata, Option> extraMetadata) { HoodieSparkTable table = HoodieSparkTable.create(config, context); - HoodieCommitMetadata metadata = CompactHelpers.getInstance().createCompactionMetadata( - table, compactionInstantTime, HoodieJavaRDD.of(writeStatuses), config.getSchema()); extraMetadata.ifPresent(m -> m.forEach(metadata::addMetadata)); - completeCompaction(metadata, writeStatuses, table, compactionInstantTime); + completeCompaction(metadata, table, compactionInstantTime); } @Override - protected void completeCompaction(HoodieCommitMetadata metadata, JavaRDD writeStatuses, + protected void completeCompaction(HoodieCommitMetadata metadata, HoodieTable>, JavaRDD, JavaRDD> table, String compactionCommitTime) { this.context.setJobStatus(this.getClass().getSimpleName(), "Collect compaction write status and commit compaction"); - List writeStats = writeStatuses.map(WriteStatus::getStat).collect(); + List writeStats = metadata.getWriteStats(); final HoodieInstant compactionInstant = new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, compactionCommitTime); try { this.txnManager.beginTransaction(Option.of(compactionInstant), Option.empty()); finalizeWrite(table, compactionCommitTime, writeStats); // commit to data table after committing to metadata table. - writeTableMetadataForTableServices(table, metadata, compactionInstant); + updateTableMetadata(table, metadata, compactionInstant); LOG.info("Committing Compaction " + compactionCommitTime + ". 
Finished with result " + metadata); CompactHelpers.getInstance().completeInflightCompaction(table, compactionCommitTime, metadata); } finally { @@ -327,7 +324,7 @@ protected void completeCompaction(HoodieCommitMetadata metadata, JavaRDD compact(String compactionInstantTime, boolean shouldComplete) { + protected HoodieWriteMetadata> compact(String compactionInstantTime, boolean shouldComplete) { HoodieSparkTable table = HoodieSparkTable.create(config, context, true); preWrite(compactionInstantTime, WriteOperationType.COMPACT, table.getMetaClient()); HoodieTimeline pendingCompactionTimeline = table.getActiveTimeline().filterPendingCompactionTimeline(); @@ -339,11 +336,10 @@ protected JavaRDD compact(String compactionInstantTime, boolean sho compactionTimer = metrics.getCompactionCtx(); HoodieWriteMetadata> compactionMetadata = table.compact(context, compactionInstantTime); - JavaRDD statuses = compactionMetadata.getWriteStatuses(); if (shouldComplete && compactionMetadata.getCommitMetadata().isPresent()) { - completeTableService(TableServiceType.COMPACT, compactionMetadata.getCommitMetadata().get(), statuses, table, compactionInstantTime); + completeTableService(TableServiceType.COMPACT, compactionMetadata.getCommitMetadata().get(), table, compactionInstantTime); } - return statuses; + return compactionMetadata; } @Override @@ -359,15 +355,14 @@ public HoodieWriteMetadata> cluster(String clusteringInstan clusteringTimer = metrics.getClusteringCtx(); LOG.info("Starting clustering at " + clusteringInstant); HoodieWriteMetadata> clusteringMetadata = table.cluster(context, clusteringInstant); - JavaRDD statuses = clusteringMetadata.getWriteStatuses(); // TODO : Where is shouldComplete used ? if (shouldComplete && clusteringMetadata.getCommitMetadata().isPresent()) { - completeTableService(TableServiceType.CLUSTER, clusteringMetadata.getCommitMetadata().get(), statuses, table, clusteringInstant); + completeTableService(TableServiceType.CLUSTER, clusteringMetadata.getCommitMetadata().get(), table, clusteringInstant); } return clusteringMetadata; } - private void completeClustering(HoodieReplaceCommitMetadata metadata, JavaRDD writeStatuses, + private void completeClustering(HoodieReplaceCommitMetadata metadata, HoodieTable>, JavaRDD, JavaRDD> table, String clusteringCommitTime) { @@ -378,17 +373,20 @@ private void completeClustering(HoodieReplaceCommitMetadata metadata, JavaRDD s.getTotalWriteErrors() > 0L).map(s -> s.getFileId()).collect(Collectors.joining(","))); } + final HoodieInstant clusteringInstant = new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.REPLACE_COMMIT_ACTION, clusteringCommitTime); try { this.txnManager.beginTransaction(Option.of(clusteringInstant), Option.empty()); + finalizeWrite(table, clusteringCommitTime, writeStats); - writeTableMetadataForTableServices(table, metadata,clusteringInstant); - // Update outstanding metadata indexes - if (config.isLayoutOptimizationEnabled() - && !config.getClusteringSortColumns().isEmpty()) { - table.updateMetadataIndexes(context, writeStats, clusteringCommitTime); - } + // Update table's metadata (table) + updateTableMetadata(table, metadata, clusteringInstant); + // Update tables' metadata indexes + // NOTE: This overlaps w/ metadata table (above) and will be reconciled in the future + table.updateMetadataIndexes(context, writeStats, clusteringCommitTime); + LOG.info("Committing Clustering " + clusteringCommitTime + ". 
Finished with result " + metadata); + table.getActiveTimeline().transitionReplaceInflightToComplete( HoodieTimeline.getReplaceCommitInflightInstant(clusteringCommitTime), Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); @@ -412,17 +410,18 @@ private void completeClustering(HoodieReplaceCommitMetadata metadata, JavaRDD>, JavaRDD, JavaRDD> table, HoodieCommitMetadata commitMetadata, - HoodieInstant hoodieInstant) { + private void updateTableMetadata(HoodieTable>, JavaRDD, JavaRDD> table, HoodieCommitMetadata commitMetadata, + HoodieInstant hoodieInstant) { boolean isTableServiceAction = table.isTableServiceAction(hoodieInstant.getAction()); // Do not do any conflict resolution here as we do with regular writes. We take the lock here to ensure all writes to metadata table happens within a // single lock (single writer). Because more than one write to metadata table will result in conflicts since all of them updates the same partition. - table.getMetadataWriter(hoodieInstant.getTimestamp()).ifPresent( - w -> w.update(commitMetadata, hoodieInstant.getTimestamp(), isTableServiceAction)); + table.getMetadataWriter(hoodieInstant.getTimestamp()) + .ifPresent(writer -> writer.update(commitMetadata, hoodieInstant.getTimestamp(), isTableServiceAction)); } @Override - protected HoodieTable>, JavaRDD, JavaRDD> getTableAndInitCtx(WriteOperationType operationType, String instantTime) { + protected HoodieTable>, JavaRDD, JavaRDD> getTableAndInitCtx(WriteOperationType operationType, + String instantTime) { HoodieTableMetaClient metaClient = createMetaClient(true); UpgradeDowngrade upgradeDowngrade = new UpgradeDowngrade( metaClient, config, context, SparkUpgradeDowngradeHelper.getInstance()); @@ -439,8 +438,11 @@ protected HoodieTable>, JavaRDD, JavaRDD inFlightInstantTimestamp) { } // TODO : To enforce priority between table service and ingestion writer, use transactions here and invoke strategy - private void completeTableService(TableServiceType tableServiceType, HoodieCommitMetadata metadata, JavaRDD writeStatuses, + private void completeTableService(TableServiceType tableServiceType, HoodieCommitMetadata metadata, HoodieTable>, JavaRDD, JavaRDD> table, String commitInstant) { switch (tableServiceType) { case CLUSTER: - completeClustering((HoodieReplaceCommitMetadata) metadata, writeStatuses, table, commitInstant); + completeClustering((HoodieReplaceCommitMetadata) metadata, table, commitInstant); break; case COMPACT: - completeCompaction(metadata, writeStatuses, table, commitInstant); + completeCompaction(metadata, table, commitInstant); break; default: throw new IllegalArgumentException("This table service is not valid " + tableServiceType); @@ -497,7 +499,7 @@ private HoodieTable>, JavaRDD, JavaRDD rdd.unpersist()); + // If we do not explicitly release the resource, spark will automatically manage the resource and clean it up automatically + // see: https://spark.apache.org/docs/latest/rdd-programming-guide.html#removing-data + if (config.areReleaseResourceEnabled()) { + ((HoodieSparkEngineContext) context).getJavaSparkContext().getPersistentRDDs().values() + .forEach(JavaRDD::unpersist); + } } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/SparkSizeBasedClusteringPlanStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/SparkSizeBasedClusteringPlanStrategy.java index b38931c2d93d1..6629569d096b3 100644 --- 
a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/SparkSizeBasedClusteringPlanStrategy.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/SparkSizeBasedClusteringPlanStrategy.java @@ -87,7 +87,7 @@ protected Stream buildClusteringGroupsForPartition(String // Add to the current file-group currentGroup.add(currentSlice); - // assume each filegroup size is ~= parquet.max.file.size + // assume each file group size is ~= parquet.max.file.size totalSizeSoFar += currentSlice.getBaseFile().isPresent() ? currentSlice.getBaseFile().get().getFileSize() : writeConfig.getParquetMaxFileSize(); } @@ -118,7 +118,7 @@ protected Map getStrategyParams() { @Override protected Stream getFileSlicesEligibleForClustering(final String partition) { return super.getFileSlicesEligibleForClustering(partition) - // Only files that have basefile size smaller than small file size are eligible. + // Only files that have base file size smaller than small file size are eligible. .filter(slice -> slice.getBaseFile().map(HoodieBaseFile::getFileSize).orElse(0L) < getWriteConfig().getClusteringSmallFileLimit()); } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/MultipleSparkJobExecutionStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/MultipleSparkJobExecutionStrategy.java index c88b848ddf8cf..91d1f4e4e4fa2 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/MultipleSparkJobExecutionStrategy.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/MultipleSparkJobExecutionStrategy.java @@ -26,8 +26,10 @@ import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.client.utils.ConcatenatingIterator; import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.ClusteringOperation; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieFileGroupId; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; @@ -39,11 +41,13 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieClusteringConfig; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.exception.HoodieClusteringException; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner; -import org.apache.hudi.execution.bulkinsert.RDDSpatialCurveOptimizationSortPartitioner; +import org.apache.hudi.execution.bulkinsert.RDDSpatialCurveSortPartitioner; import org.apache.hudi.io.IOUtils; import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.hudi.io.storage.HoodieFileReaderFactory; @@ -80,7 +84,7 @@ * Clustering strategy to submit multiple spark jobs and union the results. 
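For context on the `getPartitioner` change in this strategy, shown below: the sort partitioner is now chosen by dispatching on the configured layout optimization strategy. A hedged sketch follows; the enum mirrors `HoodieClusteringConfig.LayoutOptimizationStrategy`, but `Partitioner` and the two implementations are simplified stand-ins, not Hudi's classes.

```java
import java.util.Optional;

// Hypothetical sketch of the strategy-to-partitioner dispatch used below.
final class PartitionerDispatch {
  enum LayoutOptimizationStrategy { LINEAR, ZORDER, HILBERT }

  interface Partitioner {}

  static final class SpatialCurveSortPartitioner implements Partitioner {
    SpatialCurveSortPartitioner(String[] orderByColumns, LayoutOptimizationStrategy strategy) {
      // would sort records along a Z-order/Hilbert space-filling curve
    }
  }

  static final class CustomColumnsSortPartitioner implements Partitioner {
    CustomColumnsSortPartitioner(String[] orderByColumns) {
      // would sort records lexicographically on the given columns
    }
  }

  // No sort columns configured => no partitioner; otherwise pick by strategy.
  static Optional<Partitioner> getPartitioner(String sortColumnsCsv,
                                              LayoutOptimizationStrategy strategy) {
    return Optional.ofNullable(sortColumnsCsv)
        .map(csv -> csv.split(","))
        .map(columns -> {
          switch (strategy) {
            case ZORDER:
            case HILBERT:
              // Space-filling curves cluster data on several columns at once.
              return (Partitioner) new SpatialCurveSortPartitioner(columns, strategy);
            case LINEAR:
              return new CustomColumnsSortPartitioner(columns);
            default:
              throw new UnsupportedOperationException("Unsupported strategy: " + strategy);
          }
        });
  }
}
```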
*/ public abstract class MultipleSparkJobExecutionStrategy> - extends ClusteringExecutionStrategy>, JavaRDD, JavaRDD> { + extends ClusteringExecutionStrategy>, HoodieData, HoodieData> { private static final Logger LOG = LogManager.getLogger(MultipleSparkJobExecutionStrategy.class); public MultipleSparkJobExecutionStrategy(HoodieTable table, HoodieEngineContext engineContext, HoodieWriteConfig writeConfig) { @@ -88,7 +92,7 @@ public MultipleSparkJobExecutionStrategy(HoodieTable table, HoodieEngineContext } @Override - public HoodieWriteMetadata> performClustering(final HoodieClusteringPlan clusteringPlan, final Schema schema, final String instantTime) { + public HoodieWriteMetadata> performClustering(final HoodieClusteringPlan clusteringPlan, final Schema schema, final String instantTime) { JavaSparkContext engineContext = HoodieSparkEngineContext.getSparkContext(getEngineContext()); // execute clustering for each group async and collect WriteStatus Stream> writeStatusRDDStream = FutureUtils.allOf( @@ -103,8 +107,8 @@ public HoodieWriteMetadata> performClustering(final HoodieC JavaRDD[] writeStatuses = convertStreamToArray(writeStatusRDDStream); JavaRDD writeStatusRDD = engineContext.union(writeStatuses); - HoodieWriteMetadata> writeMetadata = new HoodieWriteMetadata<>(); - writeMetadata.setWriteStatuses(writeStatusRDD); + HoodieWriteMetadata> writeMetadata = new HoodieWriteMetadata<>(); + writeMetadata.setWriteStatuses(HoodieJavaRDD.of(writeStatusRDD)); return writeMetadata; } @@ -133,17 +137,29 @@ public abstract JavaRDD performClusteringWithRecordsRDD(final JavaR * @param schema Schema of the data including metadata fields. * @return {@link RDDCustomColumnsSortPartitioner} if sort columns are provided, otherwise empty. */ - protected Option> getPartitioner(Map strategyParams, Schema schema) { - if (getWriteConfig().isLayoutOptimizationEnabled()) { - // sort input records by z-order/hilbert - return Option.of(new RDDSpatialCurveOptimizationSortPartitioner((HoodieSparkEngineContext) getEngineContext(), - getWriteConfig(), HoodieAvroUtils.addMetadataFields(schema))); - } else if (strategyParams.containsKey(PLAN_STRATEGY_SORT_COLUMNS.key())) { - return Option.of(new RDDCustomColumnsSortPartitioner(strategyParams.get(PLAN_STRATEGY_SORT_COLUMNS.key()).split(","), - HoodieAvroUtils.addMetadataFields(schema), getWriteConfig().isConsistentLogicalTimestampEnabled())); - } else { - return Option.empty(); - } + protected Option>>> getPartitioner(Map strategyParams, Schema schema) { + Option orderByColumnsOpt = + Option.ofNullable(strategyParams.get(PLAN_STRATEGY_SORT_COLUMNS.key())) + .map(listStr -> listStr.split(",")); + + return orderByColumnsOpt.map(orderByColumns -> { + HoodieClusteringConfig.LayoutOptimizationStrategy layoutOptStrategy = getWriteConfig().getLayoutOptimizationStrategy(); + switch (layoutOptStrategy) { + case ZORDER: + case HILBERT: + return new RDDSpatialCurveSortPartitioner( + (HoodieSparkEngineContext) getEngineContext(), + orderByColumns, + layoutOptStrategy, + getWriteConfig().getLayoutOptimizationCurveBuildMethod(), + HoodieAvroUtils.addMetadataFields(schema)); + case LINEAR: + return new RDDCustomColumnsSortPartitioner(orderByColumns, HoodieAvroUtils.addMetadataFields(schema), + getWriteConfig().isConsistentLogicalTimestampEnabled()); + default: + throw new UnsupportedOperationException(String.format("Layout optimization strategy '%s' is not supported", layoutOptStrategy)); + } + }); } /** @@ -278,7 +294,7 @@ private HoodieRecord transform(IndexedRecord indexedRecord) { 
HoodieKey hoodieKey = new HoodieKey(key, partition); HoodieRecordPayload avroPayload = new RewriteAvroPayload(record); - HoodieRecord hoodieRecord = new HoodieRecord(hoodieKey, avroPayload); + HoodieRecord hoodieRecord = new HoodieAvroRecord(hoodieKey, avroPayload); return hoodieRecord; } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SingleSparkJobExecutionStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SingleSparkJobExecutionStrategy.java index 98bf9151fc9ef..1158d0ada42f0 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SingleSparkJobExecutionStrategy.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SingleSparkJobExecutionStrategy.java @@ -25,10 +25,12 @@ import org.apache.hudi.client.utils.ConcatenatingIterator; import org.apache.hudi.common.config.SerializableSchema; import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.engine.TaskContextSupplier; import org.apache.hudi.common.model.ClusteringGroupInfo; import org.apache.hudi.common.model.ClusteringOperation; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieFileGroupId; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; @@ -36,6 +38,7 @@ import org.apache.hudi.common.model.RewriteAvroPayload; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.exception.HoodieClusteringException; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.io.storage.HoodieFileReaderFactory; @@ -70,7 +73,7 @@ * MultipleSparkJobExecution strategy is not ideal for use cases that require large number of clustering groups */ public abstract class SingleSparkJobExecutionStrategy> - extends ClusteringExecutionStrategy>, JavaRDD, JavaRDD> { + extends ClusteringExecutionStrategy>, HoodieData, HoodieData> { private static final Logger LOG = LogManager.getLogger(SingleSparkJobExecutionStrategy.class); public SingleSparkJobExecutionStrategy(HoodieTable table, HoodieEngineContext engineContext, HoodieWriteConfig writeConfig) { @@ -78,7 +81,7 @@ public SingleSparkJobExecutionStrategy(HoodieTable table, HoodieEngineContext en } @Override - public HoodieWriteMetadata> performClustering(final HoodieClusteringPlan clusteringPlan, final Schema schema, final String instantTime) { + public HoodieWriteMetadata> performClustering(final HoodieClusteringPlan clusteringPlan, final Schema schema, final String instantTime) { JavaSparkContext engineContext = HoodieSparkEngineContext.getSparkContext(getEngineContext()); final TaskContextSupplier taskContextSupplier = getEngineContext().getTaskContextSupplier(); final SerializableSchema serializableSchema = new SerializableSchema(schema); @@ -103,8 +106,8 @@ public HoodieWriteMetadata> performClustering(final HoodieC ).iterator(); }); - HoodieWriteMetadata> writeMetadata = new HoodieWriteMetadata<>(); - writeMetadata.setWriteStatuses(writeStatusRDD); + HoodieWriteMetadata> writeMetadata = new HoodieWriteMetadata<>(); + writeMetadata.setWriteStatuses(HoodieJavaRDD.of(writeStatusRDD)); return writeMetadata; } @@ -124,8 +127,8 @@ private Stream 
runClusteringForGroup(ClusteringGroupInfo clustering Iterator> writeStatuses = performClusteringWithRecordsIterator(inputRecords, clusteringOps.getNumOutputGroups(), instantTime, strategyParams, schema.get(), inputFileIds, preserveHoodieMetadata, taskContextSupplier); - Iterable> writestatusIterable = () -> writeStatuses; - return StreamSupport.stream(writestatusIterable.spliterator(), false) + Iterable> writeStatusIterable = () -> writeStatuses; + return StreamSupport.stream(writeStatusIterable.spliterator(), false) .flatMap(writeStatusList -> writeStatusList.stream()); } @@ -181,7 +184,7 @@ private HoodieRecord transform(IndexedRecord indexedRecord) { HoodieKey hoodieKey = new HoodieKey(key, partition); HoodieRecordPayload avroPayload = new RewriteAvroPayload(record); - HoodieRecord hoodieRecord = new HoodieRecord(hoodieKey, avroPayload); + HoodieRecord hoodieRecord = new HoodieAvroRecord(hoodieKey, avroPayload); return hoodieRecord; } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/update/strategy/BaseSparkUpdateStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/update/strategy/BaseSparkUpdateStrategy.java new file mode 100644 index 0000000000000..655c11983e46b --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/update/strategy/BaseSparkUpdateStrategy.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.clustering.update.strategy; + +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.table.action.cluster.strategy.UpdateStrategy; + +import org.apache.spark.api.java.JavaRDD; + +import java.util.List; +import java.util.Set; + +/** + * Base Spark update strategy: records written to file groups that are under pending + * clustering need to be checked. Spark-specific implementations should extend this base class. + */ +public abstract class BaseSparkUpdateStrategy> extends UpdateStrategy>> { + + public BaseSparkUpdateStrategy(HoodieSparkEngineContext engineContext, + Set fileGroupsInPendingClustering) { + super(engineContext, fileGroupsInPendingClustering); + } + + /** + * Get the file group ids that the given records update.
+ * @param inputRecords the records to write, tagged with target file id + * @return the file group ids matched by the given records + */ + protected List getGroupIdsWithUpdate(JavaRDD> inputRecords) { + return inputRecords + .filter(record -> record.getCurrentLocation() != null) + .map(record -> new HoodieFileGroupId(record.getPartitionPath(), record.getCurrentLocation().getFileId())).distinct().collect(); + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/update/strategy/SparkAllowUpdateStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/update/strategy/SparkAllowUpdateStrategy.java index 403a0c2e1ca87..92a5fb69a7cd9 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/update/strategy/SparkAllowUpdateStrategy.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/update/strategy/SparkAllowUpdateStrategy.java @@ -23,7 +23,6 @@ import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.table.action.cluster.strategy.UpdateStrategy; import org.apache.spark.api.java.JavaRDD; @@ -35,20 +34,13 @@ /** * Allow ingestion commits during clustering job. */ -public class SparkAllowUpdateStrategy> extends UpdateStrategy>> { +public class SparkAllowUpdateStrategy> extends BaseSparkUpdateStrategy { - public SparkAllowUpdateStrategy( - HoodieSparkEngineContext engineContext, HashSet fileGroupsInPendingClustering) { + public SparkAllowUpdateStrategy(HoodieSparkEngineContext engineContext, + HashSet fileGroupsInPendingClustering) { super(engineContext, fileGroupsInPendingClustering); } - private List getGroupIdsWithUpdate(JavaRDD> inputRecords) { - List fileGroupIdsWithUpdates = inputRecords - .filter(record -> record.getCurrentLocation() != null) - .map(record -> new HoodieFileGroupId(record.getPartitionPath(), record.getCurrentLocation().getFileId())).distinct().collect(); - return fileGroupIdsWithUpdates; - } - @Override public Pair>, Set> handleUpdate(JavaRDD> taggedRecordsRDD) { List fileGroupIdsWithRecordUpdate = getGroupIdsWithUpdate(taggedRecordsRDD); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/update/strategy/SparkRejectUpdateStrategy.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/update/strategy/SparkRejectUpdateStrategy.java index b12d9ad435713..ac058a4d85095 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/update/strategy/SparkRejectUpdateStrategy.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/update/strategy/SparkRejectUpdateStrategy.java @@ -24,7 +24,6 @@ import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieClusteringUpdateException; -import org.apache.hudi.table.action.cluster.strategy.UpdateStrategy; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -37,22 +36,16 @@ /** * Update strategy based on following.
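The `getGroupIdsWithUpdate` helper hoisted into the base class above reduces, in essence, to a filter-map-distinct over the index-tagged records. An engine-agnostic sketch using plain Java streams in place of a `JavaRDD`; `TaggedRecord` and `FileGroupId` are simplified stand-ins for Hudi's types:

```java
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;

// Hypothetical sketch of collecting the distinct file groups that updates land in.
final class UpdateGroupIds {
  interface TaggedRecord {
    String getPartitionPath();
    String getCurrentFileId(); // null when index lookup found no existing location (an insert)
  }

  static final class FileGroupId {
    final String partitionPath;
    final String fileId;
    FileGroupId(String partitionPath, String fileId) {
      this.partitionPath = partitionPath;
      this.fileId = fileId;
    }
    @Override
    public boolean equals(Object o) {
      if (!(o instanceof FileGroupId)) {
        return false;
      }
      FileGroupId that = (FileGroupId) o;
      return partitionPath.equals(that.partitionPath) && fileId.equals(that.fileId);
    }
    @Override
    public int hashCode() {
      return Objects.hash(partitionPath, fileId);
    }
  }

  // Records with a known current location are updates; collect the distinct
  // file groups those updates would touch.
  static List<FileGroupId> groupIdsWithUpdate(List<TaggedRecord> taggedRecords) {
    return taggedRecords.stream()
        .filter(record -> record.getCurrentFileId() != null)
        .map(record -> new FileGroupId(record.getPartitionPath(), record.getCurrentFileId()))
        .distinct()
        .collect(Collectors.toList());
  }
}
```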
- * if some file group have update record, throw exception + * if some file groups have update records, throw an exception */ -public class SparkRejectUpdateStrategy> extends UpdateStrategy>> { +public class SparkRejectUpdateStrategy> extends BaseSparkUpdateStrategy { private static final Logger LOG = LogManager.getLogger(SparkRejectUpdateStrategy.class); - public SparkRejectUpdateStrategy(HoodieSparkEngineContext engineContext, HashSet fileGroupsInPendingClustering) { + public SparkRejectUpdateStrategy(HoodieSparkEngineContext engineContext, + HashSet fileGroupsInPendingClustering) { super(engineContext, fileGroupsInPendingClustering); } - private List getGroupIdsWithUpdate(JavaRDD> inputRecords) { - List fileGroupIdsWithUpdates = inputRecords - .filter(record -> record.getCurrentLocation() != null) - .map(record -> new HoodieFileGroupId(record.getPartitionPath(), record.getCurrentLocation().getFileId())).distinct().collect(); - return fileGroupIdsWithUpdates; - } - @Override public Pair>, Set> handleUpdate(JavaRDD> taggedRecordsRDD) { List fileGroupIdsWithRecordUpdate = getGroupIdsWithUpdate(taggedRecordsRDD); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkValidatorUtils.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkValidatorUtils.java index 604abbd5c0282..9e72390e49f55 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkValidatorUtils.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/utils/SparkValidatorUtils.java @@ -31,13 +31,13 @@ import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor; + import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; -import scala.collection.JavaConverters; import java.util.Arrays; import java.util.HashSet; @@ -47,6 +47,8 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import scala.collection.JavaConverters; + /** * Spark validator utils to verify and run any precommit validators configured. */ @@ -97,7 +99,7 @@ public static void runValidators(HoodieWriteConfig config, } /** - * Run validators in a separate threadpool for parallelism. Each of validator can submit a distributed spark job if needed. + * Run validators in a separate thread pool for parallelism. Each validator can submit a distributed spark job if needed.
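The thread-pool idea described in the javadoc above can be sketched in isolation as follows. `Validator` is a hypothetical stand-in for `SparkPreCommitValidator` (which additionally receives the before/after table states), and the pool sizing is illustrative:

```
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class ParallelValidatorSketch {

  // Hypothetical stand-in for SparkPreCommitValidator: real validators get the
  // before/after table states and may launch distributed Spark jobs internally.
  interface Validator {
    void validate() throws Exception;
  }

  static void runAll(List<Validator> validators) {
    ExecutorService pool = Executors.newFixedThreadPool(Math.max(1, validators.size()));
    try {
      // Launch every validator asynchronously, then block until all finish;
      // any failure is rethrown when the combined future is joined.
      CompletableFuture<?>[] futures = validators.stream()
          .map(v -> CompletableFuture.runAsync(() -> {
            try {
              v.validate();
            } catch (Exception e) {
              throw new RuntimeException("Validation failed", e);
            }
          }, pool))
          .toArray(CompletableFuture[]::new);
      CompletableFuture.allOf(futures).join();
    } finally {
      pool.shutdown();
    }
  }

  public static void main(String[] args) {
    runAll(List.of(
        () -> System.out.println("sql equality validator passed"),
        () -> System.out.println("sql inequality validator passed")));
  }
}
```

A failure in any one validator propagates out of `join()`, which matches the intent here: the commit is rejected as soon as any configured validator reports a violation.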
*/ private static CompletableFuture runValidatorAsync(SparkPreCommitValidator validator, HoodieWriteMetadata writeMetadata, Dataset beforeState, Dataset afterState, String instantTime) { diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SqlQueryInequalityPreCommitValidator.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SqlQueryInequalityPreCommitValidator.java index 454638c2daa70..026334fde0cde 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SqlQueryInequalityPreCommitValidator.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SqlQueryInequalityPreCommitValidator.java @@ -34,11 +34,11 @@ import org.apache.spark.sql.SQLContext; /** - * Validator to run sql query and compare table state + * Validator to run sql query and compare table state * 1) before new commit started. * 2) current inflight commit (if successful). - * - * Expects query results dont match. + *
<p>
+ * Expects that the query results do not match. */ public class SqlQueryInequalityPreCommitValidator> extends SqlQueryPreCommitValidator { private static final Logger LOG = LogManager.getLogger(SqlQueryInequalityPreCommitValidator.class); @@ -66,7 +66,7 @@ protected void validateUsingQuery(String query, String prevTableSnapshot, String LOG.info("Completed Inequality Validation, datasets equal? " + areDatasetsEqual); if (areDatasetsEqual) { LOG.error("query validation failed. See stdout for sample query results. Query: " + query); - System.out.println("Expected query results to be inequal, but they are same. Result (sample records only):"); + System.out.println("Expected query results to be different, but they are the same. Result (sample records only):"); prevRows.show(); throw new HoodieValidationException("Query validation failed for '" + query + "'. Expected " + prevRows.count() + " rows, Found " + newRows.count()); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SqlQuerySingleResultPreCommitValidator.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SqlQuerySingleResultPreCommitValidator.java index 631f0e633c025..66e956dc59650 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SqlQuerySingleResultPreCommitValidator.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/validator/SqlQuerySingleResultPreCommitValidator.java @@ -35,9 +35,9 @@ import java.util.List; /** - * Validator to run sql queries on new table state and expects a single result. If the result doesnt match expected result, - * throw validation error. - * + * Validator to run sql queries on the new table state, expecting a single result. If the result does not match the expected result, + * throw a validation error. + *
<p>
* Example configuration: "query1#expectedResult1;query2#expectedResult2;" */ public class SqlQuerySingleResultPreCommitValidator> extends SqlQueryPreCommitValidator { diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/data/HoodieJavaRDD.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/data/HoodieJavaRDD.java index d4eb25963e5be..1381ea86e481c 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/data/HoodieJavaRDD.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/data/HoodieJavaRDD.java @@ -83,8 +83,8 @@ public JavaRDD get() { } @Override - public void persist(String storageLevel) { - rddData.persist(StorageLevel.fromString(storageLevel)); + public void persist(String cacheConfig) { + rddData.persist(StorageLevel.fromString(cacheConfig)); } @Override diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/SparkBoundedInMemoryExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/SparkBoundedInMemoryExecutor.java deleted file mode 100644 index d240c065d0834..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/SparkBoundedInMemoryExecutor.java +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.execution; - -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor; -import org.apache.hudi.common.util.queue.BoundedInMemoryQueueConsumer; -import org.apache.hudi.common.util.queue.BoundedInMemoryQueueProducer; -import org.apache.hudi.common.util.queue.IteratorBasedQueueProducer; -import org.apache.hudi.config.HoodieWriteConfig; - -import org.apache.spark.TaskContext; -import org.apache.spark.TaskContext$; - -import java.util.Iterator; -import java.util.function.Function; - -public class SparkBoundedInMemoryExecutor extends BoundedInMemoryExecutor { - - // Need to set current spark thread's TaskContext into newly launched thread so that new thread can access - // TaskContext properties. 
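The deleted comment above is the whole reason `SparkBoundedInMemoryExecutor` existed: Spark's `TaskContext` is thread-local, so a producer thread launched inside a task cannot see it unless it is re-installed. A minimal sketch of that hand-off, using the same two Spark calls the deleted class used (`TaskContext.get()` and `TaskContext$.MODULE$.setTaskContext`); the class name is illustrative and spark-core is assumed on the classpath:

```
import org.apache.spark.TaskContext;
import org.apache.spark.TaskContext$;

public class TaskContextHandoff {

  // Capture the TaskContext on the Spark task thread that builds the executor,
  // and return a Runnable that re-installs it on the producer thread later.
  // This mirrors preExecute() in the deleted SparkBoundedInMemoryExecutor.
  public static Runnable capturePreExecuteRunnable() {
    final TaskContext sparkTaskContext = TaskContext.get();
    return () -> TaskContext$.MODULE$.setTaskContext(sparkTaskContext);
  }
}
```

The refactored `SparkLazyInsertIterable` below achieves the same effect without a Spark-specific subclass, by passing `hoodieTable.getPreExecuteRunnable()` into the engine-agnostic `BoundedInMemoryExecutor`.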
- final TaskContext sparkThreadTaskContext; - - public SparkBoundedInMemoryExecutor(final HoodieWriteConfig hoodieConfig, final Iterator inputItr, - BoundedInMemoryQueueConsumer consumer, Function bufferedIteratorTransform) { - this(hoodieConfig, new IteratorBasedQueueProducer<>(inputItr), consumer, bufferedIteratorTransform); - } - - public SparkBoundedInMemoryExecutor(final HoodieWriteConfig hoodieConfig, BoundedInMemoryQueueProducer producer, - BoundedInMemoryQueueConsumer consumer, Function bufferedIteratorTransform) { - super(hoodieConfig.getWriteBufferLimitBytes(), producer, Option.of(consumer), bufferedIteratorTransform); - this.sparkThreadTaskContext = TaskContext.get(); - } - - @Override - public void preExecute() { - // Passing parent thread's TaskContext to newly launched thread for it to access original TaskContext properties. - TaskContext$.MODULE$.setTaskContext(sparkThreadTaskContext); - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/SparkLazyInsertIterable.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/SparkLazyInsertIterable.java index 088872bbd4381..a8a9e49c01c00 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/SparkLazyInsertIterable.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/SparkLazyInsertIterable.java @@ -18,7 +18,6 @@ package org.apache.hudi.execution; -import org.apache.avro.Schema; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.engine.TaskContextSupplier; @@ -30,6 +29,8 @@ import org.apache.hudi.io.WriteHandleFactory; import org.apache.hudi.table.HoodieTable; +import org.apache.avro.Schema; + import java.util.Iterator; import java.util.List; @@ -84,8 +85,8 @@ protected List computeNext() { schema = HoodieAvroUtils.addMetadataFields(schema); } bufferedIteratorExecutor = - new SparkBoundedInMemoryExecutor<>(hoodieConfig, inputItr, getInsertHandler(), - getTransformFunction(schema, hoodieConfig)); + new BoundedInMemoryExecutor<>(hoodieConfig.getWriteBufferLimitBytes(), inputItr, getInsertHandler(), + getTransformFunction(schema, hoodieConfig), hoodieTable.getPreExecuteRunnable()); final List result = bufferedIteratorExecutor.execute(); assert result != null && !result.isEmpty() && !bufferedIteratorExecutor.isRemaining(); return result; diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveOptimizationSortPartitioner.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveSortPartitioner.java similarity index 51% rename from hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveOptimizationSortPartitioner.java rename to hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveSortPartitioner.java index ca7dfa3e7f2cd..219fb0b165972 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveOptimizationSortPartitioner.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveSortPartitioner.java @@ -18,100 +18,96 @@ package org.apache.hudi.execution.bulkinsert; -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; import org.apache.hudi.AvroConversionUtils; import org.apache.hudi.HoodieSparkUtils; import org.apache.hudi.client.common.HoodieSparkEngineContext; 
import org.apache.hudi.common.config.SerializableSchema; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.RewriteAvroPayload; +import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieClusteringConfig; -import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.sort.SpaceCurveSortingHelper; import org.apache.hudi.table.BulkInsertPartitioner; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import java.util.Arrays; import java.util.List; -import java.util.stream.Collectors; - -import static org.apache.hudi.common.util.StringUtils.isNullOrEmpty; /** * A partitioner that does spatial curve optimization sorting based on specified column values for each RDD partition. * support z-curve optimization, hilbert will come soon. * @param HoodieRecordPayload type */ -public class RDDSpatialCurveOptimizationSortPartitioner +public class RDDSpatialCurveSortPartitioner implements BulkInsertPartitioner>> { - private final HoodieSparkEngineContext sparkEngineContext; - private final SerializableSchema serializableSchema; - private final HoodieWriteConfig config; - public RDDSpatialCurveOptimizationSortPartitioner(HoodieSparkEngineContext sparkEngineContext, HoodieWriteConfig config, Schema schema) { + private final HoodieSparkEngineContext sparkEngineContext; + private final String[] orderByColumns; + private final Schema schema; + private final HoodieClusteringConfig.LayoutOptimizationStrategy layoutOptStrategy; + private final HoodieClusteringConfig.SpatialCurveCompositionStrategyType curveCompositionStrategyType; + + public RDDSpatialCurveSortPartitioner(HoodieSparkEngineContext sparkEngineContext, + String[] orderByColumns, + HoodieClusteringConfig.LayoutOptimizationStrategy layoutOptStrategy, + HoodieClusteringConfig.SpatialCurveCompositionStrategyType curveCompositionStrategyType, + Schema schema) { this.sparkEngineContext = sparkEngineContext; - this.config = config; - this.serializableSchema = new SerializableSchema(schema); + this.orderByColumns = orderByColumns; + this.layoutOptStrategy = layoutOptStrategy; + this.curveCompositionStrategyType = curveCompositionStrategyType; + this.schema = schema; } @Override public JavaRDD> repartitionRecords(JavaRDD> records, int outputSparkPartitions) { - JavaRDD preparedRecord = prepareGenericRecord(records, outputSparkPartitions, serializableSchema.get()); - return preparedRecord.map(record -> { - String key = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); - String partition = record.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); - HoodieKey hoodieKey = new HoodieKey(key, partition); - HoodieRecord hoodieRecord = new HoodieRecord(hoodieKey, new RewriteAvroPayload(record)); - return hoodieRecord; - }); - } - - private JavaRDD prepareGenericRecord(JavaRDD> inputRecords, final int numOutputGroups, final Schema schema) { SerializableSchema serializableSchema = new SerializableSchema(schema); - JavaRDD genericRecordJavaRDD = inputRecords.map(f -> (GenericRecord) f.getData().getInsertValue(serializableSchema.get()).get()); - Dataset originDF = + JavaRDD genericRecordsRDD = + records.map(f -> (GenericRecord) f.getData().getInsertValue(serializableSchema.get()).get()); 
+ + Dataset sourceDataset = AvroConversionUtils.createDataFrame( - genericRecordJavaRDD.rdd(), + genericRecordsRDD.rdd(), schema.toString(), sparkEngineContext.getSqlContext().sparkSession() ); - Dataset sortedDF = reorder(originDF, numOutputGroups); - - return HoodieSparkUtils.createRdd(sortedDF, schema.getName(), - schema.getNamespace(), false, org.apache.hudi.common.util.Option.empty()).toJavaRDD(); + Dataset sortedDataset = reorder(sourceDataset, outputSparkPartitions); + + return HoodieSparkUtils.createRdd(sortedDataset, schema.getName(), schema.getNamespace(), false, Option.empty()) + .toJavaRDD() + .map(record -> { + String key = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + String partition = record.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); + HoodieKey hoodieKey = new HoodieKey(key, partition); + HoodieRecord hoodieRecord = new HoodieAvroRecord(hoodieKey, new RewriteAvroPayload(record)); + return hoodieRecord; + }); } - private Dataset reorder(Dataset originDF, int numOutputGroups) { - String orderedColumnsListConfig = config.getClusteringSortColumns(); - - if (isNullOrEmpty(orderedColumnsListConfig) || numOutputGroups <= 0) { + private Dataset reorder(Dataset dataset, int numOutputGroups) { + if (orderByColumns.length == 0) { // No-op - return originDF; + return dataset; } - List orderedCols = - Arrays.stream(orderedColumnsListConfig.split(",")) - .map(String::trim) - .collect(Collectors.toList()); - - HoodieClusteringConfig.LayoutOptimizationStrategy layoutOptStrategy = - HoodieClusteringConfig.LayoutOptimizationStrategy.fromValue(config.getLayoutOptimizationStrategy()); - - HoodieClusteringConfig.BuildCurveStrategyType curveBuildStrategyType = config.getLayoutOptimizationCurveBuildMethod(); + List orderedCols = Arrays.asList(orderByColumns); - switch (curveBuildStrategyType) { + switch (curveCompositionStrategyType) { case DIRECT: - return SpaceCurveSortingHelper.orderDataFrameByMappingValues(originDF, layoutOptStrategy, orderedCols, numOutputGroups); + return SpaceCurveSortingHelper.orderDataFrameByMappingValues(dataset, layoutOptStrategy, orderedCols, numOutputGroups); case SAMPLE: - return SpaceCurveSortingHelper.orderDataFrameBySamplingValues(originDF, layoutOptStrategy, orderedCols, numOutputGroups); + return SpaceCurveSortingHelper.orderDataFrameBySamplingValues(dataset, layoutOptStrategy, orderedCols, numOutputGroups); default: - throw new UnsupportedOperationException(String.format("Unsupported space-curve curve building strategy (%s)", curveBuildStrategyType)); + throw new UnsupportedOperationException(String.format("Unsupported space-curve curve building strategy (%s)", curveCompositionStrategyType)); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkHoodieIndex.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkHoodieIndex.java index 62bf5c100a949..aece86a3878ee 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkHoodieIndex.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkHoodieIndex.java @@ -24,7 +24,6 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.config.HoodieWriteConfig; @@ -36,7 +35,7 @@ 
@SuppressWarnings("checkstyle:LineLength") public abstract class SparkHoodieIndex> - extends HoodieIndex>, JavaRDD, JavaRDD> { + extends HoodieIndex>, JavaRDD> { protected SparkHoodieIndex(HoodieWriteConfig config) { super(config); } @@ -46,21 +45,23 @@ protected SparkHoodieIndex(HoodieWriteConfig config) { @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) public abstract JavaRDD updateLocation(JavaRDD writeStatusRDD, HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) throws HoodieIndexException; + HoodieTable hoodieTable) throws HoodieIndexException; @Override @Deprecated @PublicAPIMethod(maturity = ApiMaturityLevel.DEPRECATED) public abstract JavaRDD> tagLocation(JavaRDD> records, HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) throws HoodieIndexException; + HoodieTable hoodieTable) throws HoodieIndexException; @Override @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) - public HoodieData> tagLocation( - HoodieData> records, HoodieEngineContext context, + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, HoodieTable hoodieTable) throws HoodieIndexException { - return HoodieJavaRDD.of(tagLocation(HoodieJavaRDD.getJavaRDD(records), context, hoodieTable)); + return HoodieJavaRDD.of(tagLocation( + HoodieJavaRDD.getJavaRDD(records.map(record -> (HoodieRecord) record)), context, hoodieTable) + .map(r -> (HoodieRecord) r)); } @Override diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkHoodieIndexFactory.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkHoodieIndexFactory.java index 69e18714c49c2..d1f40dca484c5 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkHoodieIndexFactory.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/SparkHoodieIndexFactory.java @@ -53,19 +53,19 @@ public static HoodieIndex createIndex(HoodieWriteConfig config) { } switch (config.getIndexType()) { case HBASE: - return new SparkHoodieHBaseIndex<>(config); + return new SparkHoodieHBaseIndex(config); case INMEMORY: - return new HoodieInMemoryHashIndex<>(config); + return new HoodieInMemoryHashIndex(config); case BUCKET: return new HoodieBucketIndex(config); case BLOOM: - return new HoodieBloomIndex<>(config, SparkHoodieBloomIndexHelper.getInstance()); + return new HoodieBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); case GLOBAL_BLOOM: - return new HoodieGlobalBloomIndex<>(config, SparkHoodieBloomIndexHelper.getInstance()); + return new HoodieGlobalBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); case SIMPLE: - return new HoodieSimpleIndex<>(config, getKeyGeneratorForSimpleIndex(config)); + return new HoodieSimpleIndex(config, getKeyGeneratorForSimpleIndex(config)); case GLOBAL_SIMPLE: - return new HoodieGlobalSimpleIndex<>(config, getKeyGeneratorForSimpleIndex(config)); + return new HoodieGlobalSimpleIndex(config, getKeyGeneratorForSimpleIndex(config)); default: throw new HoodieIndexException("Index type unspecified, set " + config.getIndexType()); } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndexCheckFunction.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndexCheckFunction.java index 148203c9b71f1..e19a429ea7234 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndexCheckFunction.java +++ 
b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndexCheckFunction.java @@ -25,7 +25,7 @@ import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIndexException; import org.apache.hudi.io.HoodieKeyLookupHandle; -import org.apache.hudi.io.HoodieKeyLookupHandle.KeyLookupResult; +import org.apache.hudi.io.HoodieKeyLookupResult; import org.apache.hudi.table.HoodieTable; import org.apache.spark.api.java.function.Function2; @@ -40,7 +40,7 @@ * Function performing actual checking of RDD partition containing (fileId, hoodieKeys) against the actual files. */ public class HoodieBloomIndexCheckFunction - implements Function2>, Iterator>> { + implements Function2>, Iterator>> { private final HoodieTable hoodieTable; @@ -52,12 +52,12 @@ public HoodieBloomIndexCheckFunction(HoodieTable hoodieTable, HoodieWriteConfig } @Override - public Iterator> call(Integer partition, - Iterator> filePartitionRecordKeyTripletItr) { + public Iterator> call(Integer partition, + Iterator> filePartitionRecordKeyTripletItr) { return new LazyKeyCheckIterator(filePartitionRecordKeyTripletItr); } - class LazyKeyCheckIterator extends LazyIterableIterator, List> { + class LazyKeyCheckIterator extends LazyIterableIterator, List> { private HoodieKeyLookupHandle keyLookupHandle; @@ -70,9 +70,9 @@ protected void start() { } @Override - protected List computeNext() { + protected List computeNext() { - List ret = new ArrayList<>(); + List ret = new ArrayList<>(); try { // process one file in each go. while (inputItr.hasNext()) { @@ -88,7 +88,7 @@ protected List computeNext() { } // if continue on current file - if (keyLookupHandle.getPartitionPathFilePair().equals(partitionPathFilePair)) { + if (keyLookupHandle.getPartitionPathFileIDPair().equals(partitionPathFilePair)) { keyLookupHandle.addKey(recordKey); } else { // do the actual checking of file & break out diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/HoodieMetadataBloomIndexCheckFunction.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/HoodieMetadataBloomIndexCheckFunction.java new file mode 100644 index 0000000000000..32bca55099eda --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/HoodieMetadataBloomIndexCheckFunction.java @@ -0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.index.bloom; + +import org.apache.hadoop.fs.Path; +import org.apache.hudi.client.utils.LazyIterableIterator; +import org.apache.hudi.common.bloom.BloomFilterTypeCode; +import org.apache.hudi.common.bloom.HoodieDynamicBoundedBloomFilter; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieIndexException; +import org.apache.hudi.index.HoodieIndexUtils; +import org.apache.hudi.io.HoodieKeyLookupResult; +import org.apache.hudi.table.HoodieTable; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.function.Function2; +import scala.Tuple2; + +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * Spark Function2 implementation for checking bloom filters for the + * requested keys from the metadata table index. The bloom filter + * checking for keys and the actual file verification for the + * candidate keys is done in an iterative fashion. In each iteration, + * bloom filters are requested for a batch of partition files and the + * keys are checked against them. + */ +public class HoodieMetadataBloomIndexCheckFunction implements + Function2>, Iterator>> { + + private static final Logger LOG = LogManager.getLogger(HoodieMetadataBloomIndexCheckFunction.class); + + // Assuming each file bloom filter takes up 512K, sizing the max file count + // per batch so that the total fetched bloom filters would not cross 128 MB. 
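As a quick sanity check of the sizing comment above (both numbers come straight from the comment and the constant that follows; nothing else is assumed):

```
public class BloomFilterBatchSizing {
  public static void main(String[] args) {
    long assumedFilterSizeBytes = 512 * 1024L; // 512 KB per file, per the comment
    long maxFilesPerBatch = 256;               // BLOOM_FILTER_CHECK_MAX_FILE_COUNT_PER_BATCH
    long batchBytes = assumedFilterSizeBytes * maxFilesPerBatch;
    // 256 files x 512 KB = 134217728 bytes = 128 MB of bloom filters per batch, at most
    System.out.println(batchBytes / (1024 * 1024) + " MB"); // prints "128 MB"
  }
}
```

Capping the per-batch file count, rather than measuring actual filter bytes, keeps the check cheap while still bounding memory.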
+ private static final long BLOOM_FILTER_CHECK_MAX_FILE_COUNT_PER_BATCH = 256; + private final HoodieTable hoodieTable; + + public HoodieMetadataBloomIndexCheckFunction(HoodieTable hoodieTable) { + this.hoodieTable = hoodieTable; + } + + @Override + public Iterator> call(Integer integer, Iterator> tuple2Iterator) throws Exception { + return new BloomIndexLazyKeyCheckIterator(tuple2Iterator); + } + + private class BloomIndexLazyKeyCheckIterator extends LazyIterableIterator, List> { + public BloomIndexLazyKeyCheckIterator(Iterator> tuple2Iterator) { + super(tuple2Iterator); + } + + @Override + protected void start() { + } + + @Override + protected List computeNext() { + // Partition path and file name pair to list of keys + final Map, List> fileToKeysMap = new HashMap<>(); + final Map fileIDBaseFileMap = new HashMap<>(); + final List resultList = new ArrayList<>(); + + while (inputItr.hasNext()) { + Tuple2 entry = inputItr.next(); + final String partitionPath = entry._2.getPartitionPath(); + final String fileId = entry._1; + if (!fileIDBaseFileMap.containsKey(fileId)) { + Option baseFile = hoodieTable.getBaseFileOnlyView().getLatestBaseFile(partitionPath, fileId); + if (!baseFile.isPresent()) { + throw new HoodieIndexException("Failed to find the base file for partition: " + partitionPath + + ", fileId: " + fileId); + } + fileIDBaseFileMap.put(fileId, baseFile.get()); + } + fileToKeysMap.computeIfAbsent(Pair.of(partitionPath, fileIDBaseFileMap.get(fileId).getFileName()), + k -> new ArrayList<>()).add(entry._2); + if (fileToKeysMap.size() > BLOOM_FILTER_CHECK_MAX_FILE_COUNT_PER_BATCH) { + break; + } + } + if (fileToKeysMap.isEmpty()) { + return Collections.emptyList(); + } + + List> partitionNameFileNameList = new ArrayList<>(fileToKeysMap.keySet()); + Map, ByteBuffer> fileToBloomFilterMap = + hoodieTable.getMetadataTable().getBloomFilters(partitionNameFileNameList); + + final AtomicInteger totalKeys = new AtomicInteger(0); + fileToKeysMap.forEach((partitionPathFileNamePair, hoodieKeyList) -> { + final String partitionPath = partitionPathFileNamePair.getLeft(); + final String fileName = partitionPathFileNamePair.getRight(); + final String fileId = FSUtils.getFileId(fileName); + ValidationUtils.checkState(!fileId.isEmpty()); + + if (!fileToBloomFilterMap.containsKey(partitionPathFileNamePair)) { + throw new HoodieIndexException("Failed to get the bloom filter for " + partitionPathFileNamePair); + } + final ByteBuffer fileBloomFilterByteBuffer = fileToBloomFilterMap.get(partitionPathFileNamePair); + + HoodieDynamicBoundedBloomFilter fileBloomFilter = + new HoodieDynamicBoundedBloomFilter(StandardCharsets.UTF_8.decode(fileBloomFilterByteBuffer).toString(), + BloomFilterTypeCode.DYNAMIC_V0); + + List candidateRecordKeys = new ArrayList<>(); + hoodieKeyList.forEach(hoodieKey -> { + totalKeys.incrementAndGet(); + if (fileBloomFilter.mightContain(hoodieKey.getRecordKey())) { + candidateRecordKeys.add(hoodieKey.getRecordKey()); + } + }); + + final HoodieBaseFile dataFile = fileIDBaseFileMap.get(fileId); + List matchingKeys = + HoodieIndexUtils.filterKeysFromFile(new Path(dataFile.getPath()), candidateRecordKeys, + hoodieTable.getHadoopConf()); + LOG.debug( + String.format("Total records (%d), bloom filter candidates (%d)/fp(%d), actual matches (%d)", + hoodieKeyList.size(), candidateRecordKeys.size(), + candidateRecordKeys.size() - matchingKeys.size(), matchingKeys.size())); + + resultList.add(new HoodieKeyLookupResult(fileId, partitionPath, dataFile.getCommitTime(), matchingKeys)); + }); + return 
resultList; + } + + @Override + protected void end() { + } + } +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/SparkHoodieBloomIndexHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/SparkHoodieBloomIndexHelper.java index bbb50d5cf6fff..1659fe016ca1d 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/SparkHoodieBloomIndexHelper.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/bloom/SparkHoodieBloomIndexHelper.java @@ -24,25 +24,23 @@ import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecordLocation; -import org.apache.hudi.common.util.collection.ImmutablePair; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.data.HoodieJavaPairRDD; import org.apache.hudi.data.HoodieJavaRDD; +import org.apache.hudi.io.HoodieKeyLookupResult; import org.apache.hudi.table.HoodieTable; - import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.Partitioner; import org.apache.spark.api.java.JavaRDD; +import scala.Tuple2; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.stream.Collectors; -import scala.Tuple2; - /** * Helper for {@link HoodieBloomIndex} containing Spark-specific logic. */ @@ -64,32 +62,44 @@ public static SparkHoodieBloomIndexHelper getInstance() { public HoodiePairData findMatchingFilesForRecordKeys( HoodieWriteConfig config, HoodieEngineContext context, HoodieTable hoodieTable, HoodiePairData partitionRecordKeyPairs, - HoodieData> fileComparisonPairs, + HoodieData> fileComparisonPairs, Map> partitionToFileInfo, Map recordsPerPartition) { JavaRDD> fileComparisonsRDD = HoodieJavaRDD.getJavaRDD(fileComparisonPairs) .map(pair -> new Tuple2<>(pair.getLeft(), pair.getRight())); - Map comparisonsPerFileGroup = computeComparisonsPerFileGroup( - config, recordsPerPartition, partitionToFileInfo, fileComparisonsRDD, context); - int inputParallelism = - HoodieJavaPairRDD.getJavaPairRDD(partitionRecordKeyPairs).partitions().size(); + + int inputParallelism = HoodieJavaPairRDD.getJavaPairRDD(partitionRecordKeyPairs).partitions().size(); int joinParallelism = Math.max(inputParallelism, config.getBloomIndexParallelism()); LOG.info("InputParallelism: ${" + inputParallelism + "}, IndexParallelism: ${" + config.getBloomIndexParallelism() + "}"); - if (config.useBloomIndexBucketizedChecking()) { + JavaRDD> keyLookupResultRDD; + if (config.isMetadataBloomFilterIndexEnabled()) { + // Step 1: Sort by file id + JavaRDD> sortedFileIdAndKeyPairs = + fileComparisonsRDD.sortBy(Tuple2::_1, true, joinParallelism); + + // Step 2: Use the bloom filter to prune keys, then check the actual base file to get the record location + keyLookupResultRDD = sortedFileIdAndKeyPairs.mapPartitionsWithIndex( + new HoodieMetadataBloomIndexCheckFunction(hoodieTable), true); + } else if (config.useBloomIndexBucketizedChecking()) { + Map comparisonsPerFileGroup = computeComparisonsPerFileGroup( + config, recordsPerPartition, partitionToFileInfo, fileComparisonsRDD, context); Partitioner partitioner = new BucketizedBloomCheckPartitioner(joinParallelism, comparisonsPerFileGroup, config.getBloomIndexKeysPerBucket()); - fileComparisonsRDD = fileComparisonsRDD.mapToPair(t -> new Tuple2<>(Pair.of(t._1, t._2.getRecordKey()), t)) -
.repartitionAndSortWithinPartitions(partitioner).map(Tuple2::_2); + keyLookupResultRDD = fileComparisonsRDD.mapToPair(t -> new Tuple2<>(Pair.of(t._1, t._2.getRecordKey()), t)) + .repartitionAndSortWithinPartitions(partitioner) + .map(Tuple2::_2) + .mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(hoodieTable, config), true); } else { - fileComparisonsRDD = fileComparisonsRDD.sortBy(Tuple2::_1, true, joinParallelism); + keyLookupResultRDD = fileComparisonsRDD.sortBy(Tuple2::_1, true, joinParallelism) + .mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(hoodieTable, config), true); } - return HoodieJavaPairRDD.of(fileComparisonsRDD.mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(hoodieTable, config), true) - .flatMap(List::iterator).filter(lr -> lr.getMatchingRecordKeys().size() > 0) + return HoodieJavaPairRDD.of(keyLookupResultRDD.flatMap(List::iterator) + .filter(lr -> lr.getMatchingRecordKeys().size() > 0) .flatMapToPair(lookupResult -> lookupResult.getMatchingRecordKeys().stream() .map(recordKey -> new Tuple2<>(new HoodieKey(recordKey, lookupResult.getPartitionPath()), new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId()))) diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/columnstats/ColumnStatsIndexHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/columnstats/ColumnStatsIndexHelper.java index d92bac4d84714..521bdb20c58fc 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/columnstats/ColumnStatsIndexHelper.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/columnstats/ColumnStatsIndexHelper.java @@ -29,7 +29,6 @@ import org.apache.hudi.exception.HoodieException; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import org.apache.parquet.io.api.Binary; import org.apache.spark.SparkContext; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; @@ -62,6 +61,7 @@ import javax.annotation.Nonnull; import java.io.IOException; import java.math.BigDecimal; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -200,7 +200,7 @@ public static Dataset buildColumnStatsTableFor( indexRow.add(minMaxValue.getLeft()); // min indexRow.add(minMaxValue.getRight()); // max - indexRow.add(colMetadata.getNumNulls()); + indexRow.add(colMetadata.getNullCount()); }); return Row$.MODULE$.apply(JavaConversions.asScalaBuffer(indexRow)); @@ -262,10 +262,10 @@ public static void updateColumnStatsIndexFor( // │ │ ├── .parquet // │ │ └── ... 
// - // If index is currently empty (no persisted tables), we simply create one - // using clustering operation's commit instance as it's name Path newIndexTablePath = new Path(indexFolderPath, commitTime); + // If index is currently empty (no persisted tables), we simply create one + // using the clustering operation's commit instant as its name if (!fs.exists(new Path(indexFolderPath))) { newColStatsIndexDf.repartition(1) .write() @@ -326,6 +326,9 @@ public static void updateColumnStatsIndexFor( .repartition(1) .write() .format("parquet") + // NOTE: We intend to potentially overwrite index-table from the previous Clustering + // operation that has failed to commit + .mode("overwrite") .save(newIndexTablePath.toString()); // Clean up residual col-stats-index tables that have might have been dangling since @@ -419,9 +422,8 @@ private static String composeZIndexColName(String col, String statName) { ); } else if (colType instanceof StringType) { return Pair.of( - new String(((Binary) colMetadata.getMinValue()).getBytes()), - new String(((Binary) colMetadata.getMaxValue()).getBytes()) - ); + colMetadata.getMinValue().toString(), + colMetadata.getMaxValue().toString()); } else if (colType instanceof DecimalType) { return Pair.of( new BigDecimal(colMetadata.getMinValue().toString()), @@ -444,8 +446,8 @@ private static String composeZIndexColName(String col, String statName) { new Float(colMetadata.getMaxValue().toString())); } else if (colType instanceof BinaryType) { return Pair.of( - ((Binary) colMetadata.getMinValue()).getBytes(), - ((Binary) colMetadata.getMaxValue()).getBytes()); + ((ByteBuffer) colMetadata.getMinValue()).array(), + ((ByteBuffer) colMetadata.getMaxValue()).array()); } else if (colType instanceof BooleanType) { return Pair.of( Boolean.valueOf(colMetadata.getMinValue().toString()), diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/hbase/SparkHoodieHBaseIndex.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/hbase/SparkHoodieHBaseIndex.java index e940c0b8211c1..fc73a0aed7d70 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/hbase/SparkHoodieHBaseIndex.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/index/hbase/SparkHoodieHBaseIndex.java @@ -24,6 +24,7 @@ import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.EmptyHoodieRecordPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; @@ -85,8 +86,7 @@ /** * Hoodie Index implementation backed by HBase. */ -public class SparkHoodieHBaseIndex> - extends HoodieIndex>, JavaRDD, JavaRDD> { +public class SparkHoodieHBaseIndex extends HoodieIndex { public static final String DEFAULT_SPARK_EXECUTOR_INSTANCES_CONFIG_NAME = "spark.executor.instances"; public static final String DEFAULT_SPARK_DYNAMIC_ALLOCATION_ENABLED_CONFIG_NAME = "spark.dynamicAllocation.enabled"; @@ -203,15 +203,13 @@ private boolean checkIfValidCommit(HoodieTableMetaClient metaClient, String comm /** * Function that tags each HoodieRecord with an existing location, if known. */ - private Function2>, Iterator>> locationTagFunction( + private Function2>, Iterator>> locationTagFunction( HoodieTableMetaClient metaClient) { // `multiGetBatchSize` is intended to be a batch per 100ms.
To create a rate limiter that measures // operations per second, we need to multiply `multiGetBatchSize` by 10. Integer multiGetBatchSize = config.getHbaseIndexGetBatchSize(); - return (Function2>, Iterator>>) (partitionNum, - hoodieRecordIterator) -> { - + return (partitionNum, hoodieRecordIterator) -> { boolean updatePartitionPath = config.getHbaseIndexUpdatePartitionPath(); RateLimiter limiter = RateLimiter.create(multiGetBatchSize * 10, TimeUnit.SECONDS); // Grab the global HBase connection @@ -220,7 +218,7 @@ private Function2>, Iterator>> hbaseConnection = getHBaseConnection(); } } - List> taggedRecords = new ArrayList<>(); + List> taggedRecords = new ArrayList<>(); try (HTable hTable = (HTable) hbaseConnection.getTable(TableName.valueOf(tableName))) { List statements = new ArrayList<>(); List currentBatchOfRecords = new LinkedList<>(); @@ -256,19 +254,19 @@ private Function2>, Iterator>> // check whether to do partition change processing if (updatePartitionPath && !partitionPath.equals(currentRecord.getPartitionPath())) { // delete partition old data record - HoodieRecord emptyRecord = new HoodieRecord(new HoodieKey(currentRecord.getRecordKey(), partitionPath), + HoodieRecord emptyRecord = new HoodieAvroRecord(new HoodieKey(currentRecord.getRecordKey(), partitionPath), new EmptyHoodieRecordPayload()); emptyRecord.unseal(); emptyRecord.setCurrentLocation(new HoodieRecordLocation(commitTs, fileId)); emptyRecord.seal(); // insert partition new data record - currentRecord = new HoodieRecord(new HoodieKey(currentRecord.getRecordKey(), currentRecord.getPartitionPath()), - currentRecord.getData()); + currentRecord = new HoodieAvroRecord(new HoodieKey(currentRecord.getRecordKey(), currentRecord.getPartitionPath()), + (HoodieRecordPayload) currentRecord.getData()); taggedRecords.add(emptyRecord); taggedRecords.add(currentRecord); } else { - currentRecord = new HoodieRecord(new HoodieKey(currentRecord.getRecordKey(), partitionPath), - currentRecord.getData()); + currentRecord = new HoodieAvroRecord(new HoodieKey(currentRecord.getRecordKey(), partitionPath), + (HoodieRecordPayload) currentRecord.getData()); currentRecord.unseal(); currentRecord.setCurrentLocation(new HoodieRecordLocation(commitTs, fileId)); currentRecord.seal(); @@ -294,8 +292,8 @@ private Result[] doGet(HTable hTable, List keys, RateLimiter limiter) throw } @Override - public HoodieData> tagLocation( - HoodieData> records, HoodieEngineContext context, + public HoodieData> tagLocation( + HoodieData> records, HoodieEngineContext context, HoodieTable hoodieTable) { return HoodieJavaRDD.of(HoodieJavaRDD.getJavaRDD(records) .mapPartitionsWithIndex(locationTagFunction(hoodieTable.getMetaClient()), true)); @@ -303,7 +301,7 @@ public HoodieData> tagLocation( private Function2, Iterator> updateLocationFunction() { - return (Function2, Iterator>) (partition, statusIterator) -> { + return (partition, statusIterator) -> { List writeStatusList = new ArrayList<>(); // Grab the global HBase connection @@ -385,7 +383,7 @@ private void doMutations(BufferedMutator mutator, List mutations, Rate mutations.clear(); } - public Map mapFileWithInsertsToUniquePartition(JavaRDD writeStatusRDD) { + Map mapFileWithInsertsToUniquePartition(JavaRDD writeStatusRDD) { final Map fileIdPartitionMap = new HashMap<>(); int partitionIndex = 0; // Map each fileId that has inserts to a unique partition Id. 
This will be used while @@ -466,7 +464,7 @@ private void acquireQPSResourcesAndSetBatchSize(final Option desiredQPSFr } } - public Tuple2 getHBasePutAccessParallelism(final JavaRDD writeStatusRDD) { + Tuple2 getHBasePutAccessParallelism(final JavaRDD writeStatusRDD) { final JavaPairRDD insertOnlyWriteStatusRDD = writeStatusRDD .filter(w -> w.getStat().getNumInserts() > 0).mapToPair(w -> new Tuple2<>(w.getStat().getNumInserts(), 1)); return insertOnlyWriteStatusRDD.fold(new Tuple2<>(0L, 0), (w, c) -> new Tuple2<>(w._1 + c._1, w._2 + c._2)); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowCreateHandle.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowCreateHandle.java index 3566a8d8f4120..5cdb2ff68fc63 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowCreateHandle.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowCreateHandle.java @@ -45,7 +45,7 @@ import java.util.concurrent.atomic.AtomicLong; /** - * Create handle with InternalRow for datasource implemention of bulk insert. + * Create handle with InternalRow for datasource implementation of bulk insert. */ public class HoodieRowCreateHandle implements Serializable { diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/BuiltinKeyGenerator.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/BuiltinKeyGenerator.java index 0b30f19899ec3..fe03f60ee816c 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/BuiltinKeyGenerator.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/BuiltinKeyGenerator.java @@ -18,28 +18,26 @@ package org.apache.hudi.keygen; +import org.apache.avro.generic.GenericRecord; import org.apache.hudi.ApiMaturityLevel; -import org.apache.hudi.AvroConversionHelper; +import org.apache.hudi.AvroConversionUtils; import org.apache.hudi.HoodieSparkUtils; import org.apache.hudi.PublicAPIMethod; import org.apache.hudi.client.utils.SparkRowSerDe; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieKeyException; - -import org.apache.avro.generic.GenericRecord; import org.apache.spark.sql.Row; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.StructType; +import scala.Function1; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; -import scala.Function1; - /** * Base class for the built-in key generators. Contains methods structured for * code reuse amongst them. 
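The `BuiltinKeyGenerator` hunk below swaps `AvroConversionHelper` for `AvroConversionUtils` but keeps the same lazy-caching shape: the Row-to-Avro converter is built once, on first use, and then reused for every subsequent row. A generic sketch of that pattern (the type parameters stand in for Spark's `Row` and Avro's `GenericRecord`; the class is illustrative, not Hudi API):

```
import java.util.function.Function;

public class LazyConverterCache<I, O> {

  private final Function<I, Function<I, O>> converterFactory;
  private transient Function<I, O> converterFn; // built on first use, then reused

  public LazyConverterCache(Function<I, Function<I, O>> converterFactory) {
    this.converterFactory = converterFactory;
  }

  public O convert(I input) {
    if (converterFn == null) {
      // The expensive, schema-dependent setup runs exactly once, mirroring the
      // createConverterToAvro(row.schema(), ...) call in the hunk below.
      converterFn = converterFactory.apply(input);
    }
    return converterFn.apply(input);
  }

  public static void main(String[] args) {
    LazyConverterCache<String, Integer> cache =
        new LazyConverterCache<>(sample -> Integer::parseInt);
    System.out.println(cache.convert("41") + 1); // prints 42
  }
}
```

Keeping the cached converter `transient` matters in the Spark setting: the key generator is serialized to executors, and each executor rebuilds the converter locally rather than shipping it over the wire.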
@@ -48,7 +46,7 @@ public abstract class BuiltinKeyGenerator extends BaseKeyGenerator implements Sp private static final String STRUCT_NAME = "hoodieRowTopLevelField"; private static final String NAMESPACE = "hoodieRow"; - private transient Function1 converterFn = null; + private transient Function1 converterFn = null; private SparkRowSerDe sparkRowSerDe; protected StructType structType; @@ -69,10 +67,9 @@ protected BuiltinKeyGenerator(TypedProperties config) { @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) public String getRecordKey(Row row) { if (null == converterFn) { - converterFn = AvroConversionHelper.createConverterToAvro(row.schema(), STRUCT_NAME, NAMESPACE); + converterFn = AvroConversionUtils.createConverterToAvro(row.schema(), STRUCT_NAME, NAMESPACE); } - GenericRecord genericRecord = (GenericRecord) converterFn.apply(row); - return getKey(genericRecord).getRecordKey(); + return getKey(converterFn.apply(row)).getRecordKey(); } /** @@ -84,10 +81,9 @@ public String getRecordKey(Row row) { @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) public String getPartitionPath(Row row) { if (null == converterFn) { - converterFn = AvroConversionHelper.createConverterToAvro(row.schema(), STRUCT_NAME, NAMESPACE); + converterFn = AvroConversionUtils.createConverterToAvro(row.schema(), STRUCT_NAME, NAMESPACE); } - GenericRecord genericRecord = (GenericRecord) converterFn.apply(row); - return getKey(genericRecord).getPartitionPath(); + return getKey(converterFn.apply(row)).getPartitionPath(); } /** diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/ComplexKeyGenerator.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/ComplexKeyGenerator.java index 8d02ce60f3bec..2e2167f9379f0 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/ComplexKeyGenerator.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/ComplexKeyGenerator.java @@ -37,10 +37,14 @@ public class ComplexKeyGenerator extends BuiltinKeyGenerator { public ComplexKeyGenerator(TypedProperties props) { super(props); - this.recordKeyFields = Arrays.stream(props.getString(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key()) - .split(",")).map(String::trim).filter(s -> !s.isEmpty()).collect(Collectors.toList()); - this.partitionPathFields = Arrays.stream(props.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()) - .split(",")).map(String::trim).filter(s -> !s.isEmpty()).collect(Collectors.toList()); + this.recordKeyFields = Arrays.stream(props.getString(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key()).split(",")) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .collect(Collectors.toList()); + this.partitionPathFields = Arrays.stream(props.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()).split(",")) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .collect(Collectors.toList()); complexAvroKeyGenerator = new ComplexAvroKeyGenerator(props); } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/NonpartitionedKeyGenerator.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/NonpartitionedKeyGenerator.java index 1664c86f9baa5..032c750f03240 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/NonpartitionedKeyGenerator.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/NonpartitionedKeyGenerator.java @@ -18,9 +18,10 @@ package org.apache.hudi.keygen; -import org.apache.avro.generic.GenericRecord; import 
org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.keygen.constant.KeyGeneratorOptions; + +import org.apache.avro.generic.GenericRecord; import org.apache.spark.sql.Row; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.types.StructType; @@ -31,7 +32,7 @@ import java.util.stream.Collectors; /** - * Simple Key generator for unpartitioned Hive Tables. + * Simple Key generator for non-partitioned Hive Tables. */ public class NonpartitionedKeyGenerator extends BuiltinKeyGenerator { diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/RowKeyGeneratorHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/RowKeyGeneratorHelper.java index 24f6e7a4fa4b5..6a28fbe9501a9 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/RowKeyGeneratorHelper.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/keygen/RowKeyGeneratorHelper.java @@ -40,9 +40,9 @@ import scala.Option; -import static org.apache.hudi.keygen.KeyGenUtils.HUDI_DEFAULT_PARTITION_PATH; import static org.apache.hudi.keygen.KeyGenUtils.DEFAULT_PARTITION_PATH_SEPARATOR; import static org.apache.hudi.keygen.KeyGenUtils.EMPTY_RECORDKEY_PLACEHOLDER; +import static org.apache.hudi.keygen.KeyGenUtils.HUDI_DEFAULT_PARTITION_PATH; import static org.apache.hudi.keygen.KeyGenUtils.NULL_RECORDKEY_PLACEHOLDER; /** @@ -230,9 +230,10 @@ public static Object getNestedFieldVal(Row row, List positions) { /** * Generate the tree style positions for the field requested for as per the defined struct type. - * @param structType schema of interest - * @param field field of interest for which the positions are requested for - * @param isRecordKey {@code true} if the field requested for is a record key. {@code false} incase of a partition path. + * + * @param structType schema of interest + * @param field field of interest for which the positions are requested + * @param isRecordKey {@code true} if the requested field is a record key. {@code false} in case of a partition path. * @return the positions of the field as per the struct type.
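The tree-style positions described in the javadoc above can be illustrated with a small sketch against Spark's `StructType` API. This is a simplified take on the idea behind `getNestedFieldIndices`, not the actual implementation (which also handles the record-key/partition-path distinction); spark-sql is assumed on the classpath:

```
import java.util.ArrayList;
import java.util.List;

import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.StructType;

public class NestedFieldPositions {

  // Walk a dot-separated field path (e.g. "address.city") through a StructType
  // and collect the ordinal of each level, producing something like [2, 0].
  public static List<Integer> positionsOf(StructType structType, String fieldPath) {
    List<Integer> positions = new ArrayList<>();
    DataType current = structType;
    for (String part : fieldPath.split("\\.")) {
      if (!(current instanceof StructType)) {
        throw new IllegalArgumentException("Not a struct at segment: " + part);
      }
      StructType struct = (StructType) current;
      int idx = struct.fieldIndex(part); // throws if the field does not exist
      positions.add(idx);
      current = struct.fields()[idx].dataType();
    }
    return positions;
  }
}
```

Precomputing these ordinals once per schema lets the key generator read nested values by position on every row instead of resolving field names repeatedly.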
*/ public static List getNestedFieldIndices(StructType structType, String field, boolean isRecordKey) { diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/metadata/SparkHoodieBackedTableMetadataWriter.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/metadata/SparkHoodieBackedTableMetadataWriter.java index ccb258a8cdc61..c905f92c2eac0 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/metadata/SparkHoodieBackedTableMetadataWriter.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/metadata/SparkHoodieBackedTableMetadataWriter.java @@ -18,24 +18,23 @@ package org.apache.hudi.metadata; -import org.apache.avro.specific.SpecificRecordBase; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.metrics.Registry; -import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.exception.HoodieMetadataException; import org.apache.hudi.metrics.DistributedRegistry; +import org.apache.avro.specific.SpecificRecordBase; import org.apache.hadoop.conf.Configuration; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -43,6 +42,7 @@ import java.io.IOException; import java.util.List; +import java.util.Map; public class SparkHoodieBackedTableMetadataWriter extends HoodieBackedTableMetadataWriter { @@ -51,8 +51,8 @@ public class SparkHoodieBackedTableMetadataWriter extends HoodieBackedTableMetad /** * Return a Spark based implementation of {@code HoodieTableMetadataWriter} which can be used to * write to the metadata table. - * - * If the metadata table does not exist, an attempt is made to bootstrap it but there is no guarantted that + *
- * If the metadata table does not exist, an attempt is made to bootstrap it but there is no guarantted that
+ * <p>
+ * If the metadata table does not exist, an attempt is made to bootstrap it but there is no guarantee that * table will end up bootstrapping at this time. * * @param conf @@ -121,11 +121,12 @@ protected void initialize(HoodieEngineContext eng } } - protected void commit(HoodieData hoodieDataRecords, String partitionName, String instantTime, boolean canTriggerTableService) { + @Override + protected void commit(String instantTime, Map> partitionRecordsMap, boolean canTriggerTableService) { ValidationUtils.checkState(metadataMetaClient != null, "Metadata table is not fully initialized yet."); ValidationUtils.checkState(enabled, "Metadata table cannot be committed to as it is not enabled"); - JavaRDD records = (JavaRDD) hoodieDataRecords.get(); - JavaRDD recordRDD = prepRecords(records, partitionName, 1); + HoodieData preppedRecords = prepRecords(partitionRecordsMap); + JavaRDD preppedRecordRDD = HoodieJavaRDD.getJavaRDD(preppedRecords); try (SparkRDDWriteClient writeClient = new SparkRDDWriteClient(engineContext, metadataWriteConfig, true)) { if (canTriggerTableService) { @@ -150,7 +151,7 @@ HoodieActiveTimeline.deleteInstantFile(metadataMetaClient.getFs(), metadataMetaClient.getMetaPath(), alreadyCompletedInstant); metadataMetaClient.reloadActiveTimeline(); } - List statuses = writeClient.upsertPreppedRecords(recordRDD, instantTime).collect(); + List statuses = writeClient.upsertPreppedRecords(preppedRecordRDD, instantTime).collect(); statuses.forEach(writeStatus -> { if (writeStatus.hasErrors()) { throw new HoodieMetadataException("Failed to commit metadata table records at instant " + instantTime); @@ -168,20 +169,4 @@ protected void commit(HoodieData hoodieDataRecords, String partiti // Update total size of the metadata and count of base/log files metrics.ifPresent(m -> m.updateSizeMetrics(metadataMetaClient, metadata)); } - - /** - * Tag each record with the location in the given partition. - * - * The record is tagged with respective file slice's location based on its record key.
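The deleted `prepRecords` above pinned each metadata record to a file slice by hashing its record key into one of `numFileGroups` buckets. A minimal sketch of that mapping; the hash used here is illustrative and differs from Hudi's actual `mapRecordKeyToFileGroupIndex`:

```
public class FileGroupIndexSketch {

  // Deterministically map a record key to one of numFileGroups buckets, so the
  // same key always lands in the same file group. Illustrative hash only.
  static int mapRecordKeyToFileGroupIndex(String recordKey, int numFileGroups) {
    return Math.floorMod(recordKey.hashCode(), numFileGroups); // floorMod keeps the index non-negative
  }

  public static void main(String[] args) {
    // The deleted caller passed numFileGroups = 1, so every key maps to bucket 0.
    System.out.println(mapRecordKeyToFileGroupIndex("partition/record-001", 1)); // prints 0
  }
}
```

Because the mapping is deterministic, upserts for the same key always route to the same file group, which is what lets the metadata table maintain a single record per key across commits.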
- */ - private JavaRDD prepRecords(JavaRDD recordsRDD, String partitionName, int numFileGroups) { - List fileSlices = HoodieTableMetadataUtil.getPartitionLatestFileSlices(metadataMetaClient, partitionName); - ValidationUtils.checkArgument(fileSlices.size() == numFileGroups, String.format("Invalid number of file groups: found=%d, required=%d", fileSlices.size(), numFileGroups)); - - return recordsRDD.map(r -> { - FileSlice slice = fileSlices.get(HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex(r.getRecordKey(), numFileGroups)); - r.setCurrentLocation(new HoodieRecordLocation(slice.getBaseInstantTime(), slice.getFileId())); - return r; - }); - } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java index aa9a924ed6925..31bd436612c11 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java @@ -25,6 +25,7 @@ import org.apache.hudi.avro.model.HoodieClusteringPlan; import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.avro.model.HoodieRestoreMetadata; +import org.apache.hudi.avro.model.HoodieRestorePlan; import org.apache.hudi.avro.model.HoodieRollbackMetadata; import org.apache.hudi.avro.model.HoodieRollbackPlan; import org.apache.hudi.avro.model.HoodieSavepointMetadata; @@ -58,7 +59,7 @@ import org.apache.hudi.table.action.bootstrap.SparkBootstrapCommitActionExecutor; import org.apache.hudi.table.action.clean.CleanActionExecutor; import org.apache.hudi.table.action.clean.CleanPlanActionExecutor; -import org.apache.hudi.table.action.cluster.SparkClusteringPlanActionExecutor; +import org.apache.hudi.table.action.cluster.ClusteringPlanActionExecutor; import org.apache.hudi.table.action.cluster.SparkExecuteClusteringCommitActionExecutor; import org.apache.hudi.table.action.commit.SparkBulkInsertCommitActionExecutor; import org.apache.hudi.table.action.commit.SparkBulkInsertPreppedCommitActionExecutor; @@ -74,6 +75,7 @@ import org.apache.hudi.table.action.restore.CopyOnWriteRestoreActionExecutor; import org.apache.hudi.table.action.rollback.BaseRollbackPlanActionExecutor; import org.apache.hudi.table.action.rollback.CopyOnWriteRollbackActionExecutor; +import org.apache.hudi.table.action.rollback.RestorePlanActionExecutor; import org.apache.hudi.table.action.savepoint.SavepointActionExecutor; import org.apache.avro.Schema; @@ -184,13 +186,6 @@ private void updateColumnsStatsIndex( String basePath = metaClient.getBasePath(); String indexPath = metaClient.getColumnStatsIndexPath(); - List completedCommits = - metaClient.getCommitsTimeline() - .filterCompletedInstants() - .getInstants() - .map(HoodieInstant::getTimestamp) - .collect(Collectors.toList()); - List touchedFiles = updatedFilesStats.stream() .map(s -> new Path(basePath, s.getPath()).toString()) @@ -214,6 +209,13 @@ private void updateColumnsStatsIndex( new TableSchemaResolver(metaClient).getTableAvroSchemaWithoutMetadataFields() ); + List completedCommits = + metaClient.getCommitsTimeline() + .filterCompletedInstants() + .getInstants() + .map(HoodieInstant::getTimestamp) + .collect(Collectors.toList()); + ColumnStatsIndexHelper.updateColumnStatsIndexFor( sparkEngineContext.getSqlContext().sparkSession(), AvroConversionUtils.convertAvroSchemaToStructType(tableWriteSchema), @@ -242,7 +244,7 @@ public 
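The prepRecords refactor above replaces a single-partition helper with a map-based overload; the removed code shows the underlying mechanics: each metadata record is routed to a file group by hashing its record key, then tagged with that file slice's location so the later upsertPreppedRecords call needs no index lookup. A minimal sketch of that tagging step, reusing the helper names from the removed code (the wrapper class and method are illustrative, not part of the PR):

import java.util.List;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.metadata.HoodieTableMetadataUtil;
import org.apache.spark.api.java.JavaRDD;

final class FileGroupTaggingSketch {
  // Tag each record with the location of the file group its key hashes to,
  // mirroring the removed prepRecords(...) logic for one metadata partition.
  static JavaRDD<HoodieRecord> tagRecords(JavaRDD<HoodieRecord> recordsRDD,
                                          HoodieTableMetaClient metadataMetaClient,
                                          String partitionName) {
    List<FileSlice> fileSlices =
        HoodieTableMetadataUtil.getPartitionLatestFileSlices(metadataMetaClient, partitionName);
    int numFileGroups = fileSlices.size();
    return recordsRDD.map(r -> {
      FileSlice slice = fileSlices.get(
          HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex(r.getRecordKey(), numFileGroups));
      r.setCurrentLocation(new HoodieRecordLocation(slice.getBaseInstantTime(), slice.getFileId()));
      return r;
    });
  }
}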
HoodieWriteMetadata> compact( public Option scheduleClustering(HoodieEngineContext context, String instantTime, Option> extraMetadata) { - return new SparkClusteringPlanActionExecutor<>(context, config,this, instantTime, extraMetadata).execute(); + return new ClusteringPlanActionExecutor<>(context, config,this, instantTime, extraMetadata).execute(); } @Override @@ -258,6 +260,7 @@ public HoodieBootstrapWriteMetadata> bootstrap(HoodieEngine @Override public void rollbackBootstrap(HoodieEngineContext context, String instantTime) { + new RestorePlanActionExecutor<>(context, config, this, instantTime, HoodieTimeline.INIT_INSTANT_TS).execute(); new CopyOnWriteRestoreActionExecutor(context, config, this, instantTime, HoodieTimeline.INIT_INSTANT_TS).execute(); } @@ -353,4 +356,8 @@ public HoodieRestoreMetadata restore(HoodieEngineContext context, String restore return new CopyOnWriteRestoreActionExecutor(context, config, this, restoreInstantTime, instantToRestore).execute(); } + @Override + public Option scheduleRestore(HoodieEngineContext context, String restoreInstantTime, String instantToRestore) { + return new RestorePlanActionExecutor(context, config, this, restoreInstantTime, instantToRestore).execute(); + } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkMergeOnReadTable.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkMergeOnReadTable.java index 75af5d0f685fc..334efa7fc91f4 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkMergeOnReadTable.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkMergeOnReadTable.java @@ -52,6 +52,7 @@ import org.apache.hudi.table.action.restore.MergeOnReadRestoreActionExecutor; import org.apache.hudi.table.action.rollback.BaseRollbackPlanActionExecutor; import org.apache.hudi.table.action.rollback.MergeOnReadRollbackActionExecutor; +import org.apache.hudi.table.action.rollback.RestorePlanActionExecutor; import org.apache.spark.api.java.JavaRDD; @@ -150,6 +151,7 @@ public HoodieBootstrapWriteMetadata> bootstrap(HoodieEngine @Override public void rollbackBootstrap(HoodieEngineContext context, String instantTime) { + new RestorePlanActionExecutor<>(context, config, this, instantTime, HoodieTimeline.INIT_INSTANT_TS).execute(); new MergeOnReadRestoreActionExecutor(context, config, this, instantTime, HoodieTimeline.INIT_INSTANT_TS).execute(); } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkTable.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkTable.java index 35c9ab3a0fe94..bb8c95d745ab1 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkTable.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkTable.java @@ -18,8 +18,6 @@ package org.apache.hudi.table; -import org.apache.avro.specific.SpecificRecordBase; -import org.apache.hadoop.fs.Path; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.common.data.HoodieData; @@ -39,6 +37,11 @@ import org.apache.hudi.metadata.HoodieTableMetadataWriter; import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; import org.apache.hudi.table.action.HoodieWriteMetadata; + +import org.apache.avro.specific.SpecificRecordBase; +import org.apache.hadoop.fs.Path; +import org.apache.spark.TaskContext; +import org.apache.spark.TaskContext$; import 
org.apache.spark.api.java.JavaRDD; import java.io.IOException; @@ -63,7 +66,8 @@ public static HoodieSparkTable create(HoodieW HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(context.getHadoopConf().get()).setBasePath(config.getBasePath()) .setLoadActiveTimelineOnLoad(true).setConsistencyGuardConfig(config.getConsistencyGuardConfig()) - .setLayoutVersion(Option.of(new TimelineLayoutVersion(config.getTimelineLayoutVersion()))).build(); + .setLayoutVersion(Option.of(new TimelineLayoutVersion(config.getTimelineLayoutVersion()))) + .setFileSystemRetryConfig(config.getFileSystemRetryConfig()).build(); return HoodieSparkTable.create(config, (HoodieSparkEngineContext) context, metaClient, refreshTimeline); } @@ -110,8 +114,8 @@ protected HoodieIndex getIndex(HoodieWriteConfig config, HoodieEngineContext con * @return instance of {@link HoodieTableMetadataWriter} */ @Override - public Option getMetadataWriter(String triggeringInstantTimestamp, - Option actionMetadata) { + public Option getMetadataWriter(String triggeringInstantTimestamp, + Option actionMetadata) { if (config.isMetadataTableEnabled()) { // Create the metadata table writer. First time after the upgrade this creation might trigger // metadata table bootstrapping. Bootstrapping process could fail and checking the table @@ -131,4 +135,10 @@ public Option getMetad return Option.empty(); } + + @Override + public Runnable getPreExecuteRunnable() { + final TaskContext taskContext = TaskContext.get(); + return () -> TaskContext$.MODULE$.setTaskContext(taskContext); + } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapMetadataHandler.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapMetadataHandler.java index 75daca739c8f5..237fe6cf84849 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapMetadataHandler.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/BootstrapMetadataHandler.java @@ -26,10 +26,11 @@ */ public interface BootstrapMetadataHandler { /** - * Execute bootstrap with only metatata. + * Execute bootstrap with only metadata. + * * @param srcPartitionPath source partition path. - * @param partitionPath destination partition path. - * @param keyGenerator key generator to use. + * @param partitionPath destination partition path. + * @param keyGenerator key generator to use. * @return the {@link BootstrapWriteStatus} which has the result of execution. 
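The getPreExecuteRunnable override added above captures the Spark TaskContext on the task thread and re-installs it on whichever worker thread later runs the returned Runnable; that is what lets the bounded in-memory executors further down in this diff spawn producer/consumer threads without losing access to TaskContext.get(). A stripped-down sketch of the pattern (only TaskContext and TaskContext$ come from Spark; the wrapper class is illustrative):

import org.apache.spark.TaskContext;
import org.apache.spark.TaskContext$;

final class TaskContextPropagationSketch {
  static Runnable capture() {
    // Captured on the Spark task thread that constructs the executor...
    final TaskContext taskContext = TaskContext.get();
    // ...restored on the worker thread that eventually runs this hook, so
    // TaskContext.get() (accumulators, metrics, etc.) keeps working there.
    return () -> TaskContext$.MODULE$.setTaskContext(taskContext);
  }
}

Passed as the pre-execute hook of BoundedInMemoryExecutor (as in the Orc/Parquet bootstrap handlers and SparkMergeHelper below), this presumably runs once per queue worker thread before any records are consumed.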
*/ BootstrapWriteStatus runMetadataBootstrap(String srcPartitionPath, String partitionPath, KeyGeneratorInterface keyGenerator); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/OrcBootstrapMetadataHandler.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/OrcBootstrapMetadataHandler.java index 9587c5b30cb74..e3d0e9b3c69d4 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/OrcBootstrapMetadataHandler.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/OrcBootstrapMetadataHandler.java @@ -21,6 +21,7 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.model.HoodieFileStatus; import org.apache.hudi.client.bootstrap.BootstrapRecordPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.util.AvroOrcUtils; @@ -28,7 +29,6 @@ import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.execution.SparkBoundedInMemoryExecutor; import org.apache.hudi.io.HoodieBootstrapHandle; import org.apache.hudi.keygen.KeyGeneratorInterface; import org.apache.hudi.table.HoodieTable; @@ -67,15 +67,15 @@ void executeBootstrap(HoodieBootstrapHandle bootstrapHandle, Path so Reader orcReader = OrcFile.createReader(sourceFilePath, OrcFile.readerOptions(table.getHadoopConf())); TypeDescription orcSchema = orcReader.getSchema(); try (RecordReader reader = orcReader.rows(new Reader.Options(table.getHadoopConf()).schema(orcSchema))) { - wrapper = new SparkBoundedInMemoryExecutor(config, + wrapper = new BoundedInMemoryExecutor(config.getWriteBufferLimitBytes(), new OrcReaderIterator(reader, avroSchema, orcSchema), new BootstrapRecordConsumer(bootstrapHandle), inp -> { String recKey = keyGenerator.getKey(inp).getRecordKey(); GenericRecord gr = new GenericData.Record(HoodieAvroUtils.RECORD_KEY_SCHEMA); gr.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, recKey); BootstrapRecordPayload payload = new BootstrapRecordPayload(gr); - HoodieRecord rec = new HoodieRecord(new HoodieKey(recKey, partitionPath), payload); + HoodieRecord rec = new HoodieAvroRecord(new HoodieKey(recKey, partitionPath), payload); return rec; - }); + }, table.getPreExecuteRunnable()); wrapper.execute(); } catch (Exception e) { throw new HoodieException(e); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/ParquetBootstrapMetadataHandler.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/ParquetBootstrapMetadataHandler.java index 058c2d4267abb..d07ea771bc557 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/ParquetBootstrapMetadataHandler.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/ParquetBootstrapMetadataHandler.java @@ -21,13 +21,13 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.model.HoodieFileStatus; import org.apache.hudi.client.bootstrap.BootstrapRecordPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.util.ParquetReaderIterator; import 
org.apache.hudi.common.util.queue.BoundedInMemoryExecutor; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.execution.SparkBoundedInMemoryExecutor; import org.apache.hudi.io.HoodieBootstrapHandle; import org.apache.hudi.keygen.KeyGeneratorInterface; import org.apache.hudi.table.HoodieTable; @@ -71,15 +71,15 @@ void executeBootstrap(HoodieBootstrapHandle bootstrapHandle, try { ParquetReader reader = AvroParquetReader.builder(sourceFilePath).withConf(table.getHadoopConf()).build(); - wrapper = new SparkBoundedInMemoryExecutor(config, + wrapper = new BoundedInMemoryExecutor(config.getWriteBufferLimitBytes(), new ParquetReaderIterator(reader), new BootstrapRecordConsumer(bootstrapHandle), inp -> { String recKey = keyGenerator.getKey(inp).getRecordKey(); GenericRecord gr = new GenericData.Record(HoodieAvroUtils.RECORD_KEY_SCHEMA); gr.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, recKey); BootstrapRecordPayload payload = new BootstrapRecordPayload(gr); - HoodieRecord rec = new HoodieRecord(new HoodieKey(recKey, partitionPath), payload); + HoodieRecord rec = new HoodieAvroRecord(new HoodieKey(recKey, partitionPath), payload); return rec; - }); + }, table.getPreExecuteRunnable()); wrapper.execute(); } catch (Exception e) { throw new HoodieException(e); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapCommitActionExecutor.java index ea558c39275da..a970e8f0f97d3 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapCommitActionExecutor.java @@ -113,9 +113,9 @@ public HoodieBootstrapWriteMetadata execute() { validate(); try { HoodieTableMetaClient metaClient = table.getMetaClient(); - Option completetedInstant = + Option completedInstant = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().lastInstant(); - ValidationUtils.checkArgument(!completetedInstant.isPresent(), + ValidationUtils.checkArgument(!completedInstant.isPresent(), "Active Timeline is expected to be empty for bootstrap to be performed. " + "If you want to re-bootstrap, please rollback bootstrap first !!"); Map>>> partitionSelections = listAndProcessSourcePartitions(); @@ -181,6 +181,11 @@ public HoodieWriteMetadata> execute(JavaRDD return null; } + @Override + protected void setCommitMetadata(HoodieWriteMetadata> result) { + result.setCommitMetadata(Option.of(new HoodieCommitMetadata())); + } + @Override protected void commit(Option> extraMetadata, HoodieWriteMetadata> result) { // Perform bootstrap index write and then commit. Make sure both record-key and bootstrap-index diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/cluster/SparkClusteringPlanActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/cluster/SparkClusteringPlanActionExecutor.java deleted file mode 100644 index 81a0a74aee1d3..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/cluster/SparkClusteringPlanActionExecutor.java +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.action.cluster; - -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.table.HoodieTable; -import org.apache.spark.api.java.JavaRDD; - -import java.util.Map; - -@SuppressWarnings("checkstyle:LineLength") -public class SparkClusteringPlanActionExecutor extends - BaseClusteringPlanActionExecutor>, JavaRDD, JavaRDD> { - - public SparkClusteringPlanActionExecutor(HoodieEngineContext context, - HoodieWriteConfig config, - HoodieTable>, JavaRDD, JavaRDD> table, - String instantTime, - Option> extraMetadata) { - super(context, config, table, instantTime, extraMetadata); - } -} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/cluster/SparkExecuteClusteringCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/cluster/SparkExecuteClusteringCommitActionExecutor.java index 5b0224b0f0050..594a910428aad 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/cluster/SparkExecuteClusteringCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/cluster/SparkExecuteClusteringCommitActionExecutor.java @@ -18,111 +18,48 @@ package org.apache.hudi.table.action.cluster; -import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.avro.model.HoodieClusteringGroup; import org.apache.hudi.avro.model.HoodieClusteringPlan; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.client.clustering.run.strategy.SparkSingleFileSortExecutionStrategy; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieCommitMetadata; -import org.apache.hudi.common.model.HoodieFileGroupId; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; -import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.ClusteringUtils; -import org.apache.hudi.common.util.CommitUtils; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.exception.HoodieClusteringException; import 
org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; -import org.apache.hudi.table.action.cluster.strategy.ClusteringExecutionStrategy; import org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor; -import org.apache.avro.Schema; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; - public class SparkExecuteClusteringCommitActionExecutor> extends BaseSparkCommitActionExecutor { - private static final Logger LOG = LogManager.getLogger(SparkExecuteClusteringCommitActionExecutor.class); private final HoodieClusteringPlan clusteringPlan; public SparkExecuteClusteringCommitActionExecutor(HoodieEngineContext context, HoodieWriteConfig config, HoodieTable table, String instantTime) { super(context, config, table, instantTime, WriteOperationType.CLUSTER); - this.clusteringPlan = ClusteringUtils.getClusteringPlan(table.getMetaClient(), HoodieTimeline.getReplaceCommitRequestedInstant(instantTime)) - .map(Pair::getRight).orElseThrow(() -> new HoodieClusteringException("Unable to read clustering plan for instant: " + instantTime)); + this.clusteringPlan = ClusteringUtils.getClusteringPlan( + table.getMetaClient(), HoodieTimeline.getReplaceCommitRequestedInstant(instantTime)) + .map(Pair::getRight).orElseThrow(() -> new HoodieClusteringException( + "Unable to read clustering plan for instant: " + instantTime)); } @Override public HoodieWriteMetadata> execute() { - HoodieInstant instant = HoodieTimeline.getReplaceCommitRequestedInstant(instantTime); - // Mark instant as clustering inflight - table.getActiveTimeline().transitionReplaceRequestedToInflight(instant, Option.empty()); - table.getMetaClient().reloadActiveTimeline(); - - final Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema())); - HoodieWriteMetadata> writeMetadata = ((ClusteringExecutionStrategy>, JavaRDD, JavaRDD>) - ReflectionUtils.loadClass(config.getClusteringExecutionStrategyClass(), - new Class[] {HoodieTable.class, HoodieEngineContext.class, HoodieWriteConfig.class}, table, context, config)) - .performClustering(clusteringPlan, schema, instantTime); - JavaRDD writeStatusRDD = writeMetadata.getWriteStatuses(); - JavaRDD statuses = updateIndex(writeStatusRDD, writeMetadata); - writeMetadata.setWriteStats(statuses.map(WriteStatus::getStat).collect()); - writeMetadata.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(writeMetadata)); - commitOnAutoCommit(writeMetadata); - if (!writeMetadata.getCommitMetadata().isPresent()) { - HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(writeMetadata.getWriteStats().get(), writeMetadata.getPartitionToReplaceFileIds(), - extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType()); - writeMetadata.setCommitMetadata(Option.of(commitMetadata)); - } - return writeMetadata; - } - - /** - * Validate actions taken by clustering. In the first implementation, we validate at least one new file is written. - * But we can extend this to add more validation. E.g. number of records read = number of records written etc. - * We can also make these validations in BaseCommitActionExecutor to reuse pre-commit hooks for multiple actions. 
- */ - private void validateWriteResult(HoodieWriteMetadata> writeMetadata) { - if (writeMetadata.getWriteStatuses().isEmpty()) { - throw new HoodieClusteringException("Clustering plan produced 0 WriteStatus for " + instantTime - + " #groups: " + clusteringPlan.getInputGroups().size() + " expected at least " - + clusteringPlan.getInputGroups().stream().mapToInt(HoodieClusteringGroup::getNumOutputFileGroups).sum() - + " write statuses"); - } + HoodieWriteMetadata> writeMetadata = executeClustering(clusteringPlan); + JavaRDD transformedWriteStatuses = HoodieJavaRDD.getJavaRDD(writeMetadata.getWriteStatuses()); + return writeMetadata.clone(transformedWriteStatuses); } @Override protected String getCommitActionType() { return HoodieTimeline.REPLACE_COMMIT_ACTION; } - - @Override - protected Map> getPartitionToReplacedFileIds(HoodieWriteMetadata> writeMetadata) { - Set newFilesWritten = writeMetadata.getWriteStats().get().stream() - .map(s -> new HoodieFileGroupId(s.getPartitionPath(), s.getFileId())).collect(Collectors.toSet()); - // for the below execution strategy, new filegroup id would be same as old filegroup id - if (SparkSingleFileSortExecutionStrategy.class.getName().equals(config.getClusteringExecutionStrategyClass())) { - return ClusteringUtils.getFileGroupsFromClusteringPlan(clusteringPlan) - .collect(Collectors.groupingBy(fg -> fg.getPartitionPath(), Collectors.mapping(fg -> fg.getFileId(), Collectors.toList()))); - } - return ClusteringUtils.getFileGroupsFromClusteringPlan(clusteringPlan) - .filter(fg -> !newFilesWritten.contains(fg)) - .collect(Collectors.groupingBy(fg -> fg.getPartitionPath(), Collectors.mapping(fg -> fg.getFileId(), Collectors.toList()))); - } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BaseSparkCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BaseSparkCommitActionExecutor.java index c551310bafdd1..ba3b0be1641ee 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BaseSparkCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BaseSparkCommitActionExecutor.java @@ -20,16 +20,15 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.utils.SparkMemoryUtils; -import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.client.utils.SparkValidatorUtils; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieFileGroupId; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.HoodieWriteStat; -import org.apache.hudi.common.model.HoodieFileGroupId; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; @@ -44,9 +43,9 @@ import org.apache.hudi.exception.HoodieUpsertException; import org.apache.hudi.execution.SparkLazyInsertIterable; import org.apache.hudi.io.CreateHandleFactory; +import org.apache.hudi.io.HoodieConcatHandle; import org.apache.hudi.io.HoodieMergeHandle; import org.apache.hudi.io.HoodieSortedMergeHandle; -import org.apache.hudi.io.HoodieConcatHandle; import org.apache.hudi.keygen.BaseKeyGenerator; 
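The removed validateWriteResult above checked only that clustering produced at least one WriteStatus; its javadoc suggests extending this to compare records read against records written. A hedged sketch of such an extension (HoodieWriteStat::getNumWrites is assumed to report the per-file write count; the other names reuse the removed code and are not part of this PR):

// Sketch only: fail the clustering commit when nothing was written, reporting
// the plan's expected output size for easier debugging.
long expectedFileGroups = clusteringPlan.getInputGroups().stream()
    .mapToInt(HoodieClusteringGroup::getNumOutputFileGroups).sum();
long recordsWritten = writeMetadata.getWriteStats().get().stream()
    .mapToLong(HoodieWriteStat::getNumWrites).sum();
if (recordsWritten == 0) {
  throw new HoodieClusteringException("Clustering plan produced 0 records for " + instantTime
      + ", expected output across " + expectedFileGroups + " file groups");
}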
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory; import org.apache.hudi.table.HoodieSparkTable; @@ -68,14 +67,14 @@ import java.nio.charset.StandardCharsets; import java.time.Duration; import java.time.Instant; -import java.util.stream.Collectors; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; import java.util.List; -import java.util.Set; import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; import static org.apache.hudi.common.util.ClusteringUtils.getAllFileGroupsInPendingClusteringPlans; @@ -83,15 +82,14 @@ public abstract class BaseSparkCommitActionExecutor>, JavaRDD, JavaRDD, HoodieWriteMetadata> { private static final Logger LOG = LogManager.getLogger(BaseSparkCommitActionExecutor.class); - protected Option keyGeneratorOpt = Option.empty(); + protected final Option keyGeneratorOpt; public BaseSparkCommitActionExecutor(HoodieEngineContext context, HoodieWriteConfig config, HoodieTable table, String instantTime, WriteOperationType operationType) { - super(context, config, table, instantTime, operationType, Option.empty()); - initKeyGenIfNeeded(config.populateMetaFields()); + this(context, config, table, instantTime, operationType, Option.empty()); } public BaseSparkCommitActionExecutor(HoodieEngineContext context, @@ -101,16 +99,12 @@ public BaseSparkCommitActionExecutor(HoodieEngineContext context, WriteOperationType operationType, Option extraMetadata) { super(context, config, table, instantTime, operationType, extraMetadata); - initKeyGenIfNeeded(config.populateMetaFields()); - } - - private void initKeyGenIfNeeded(boolean populateMetaFields) { - if (!populateMetaFields) { - try { - keyGeneratorOpt = Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps()))); - } catch (IOException e) { - throw new HoodieIOException("Only BaseKeyGenerators are supported when meta columns are disabled ", e); - } + try { + keyGeneratorOpt = config.populateMetaFields() + ? 
Option.empty() + : Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(this.config.getProps())); + } catch (IOException e) { + throw new HoodieIOException("Only BaseKeyGenerators are supported when meta columns are disabled ", e); } } @@ -126,7 +120,7 @@ private JavaRDD> clusteringHandleUpdate(JavaRDD> if (fileGroupsWithUpdatesAndPendingClustering.isEmpty()) { return recordsAndPendingClusteringFileGroups.getLeft(); } - // there are filegroups pending clustering and receiving updates, so rollback the pending clustering instants + // there are file groups pending clustering and receiving updates, so rollback the pending clustering instants // there could be race condition, for example, if the clustering completes after instants are fetched but before rollback completed if (config.isRollbackPendingClustering()) { Set pendingClusteringInstantsToRollback = getAllFileGroupsInPendingClusteringPlans(table.getMetaClient()).entrySet().stream() @@ -154,19 +148,22 @@ public HoodieWriteMetadata> execute(JavaRDD LOG.info("RDD PreppedRecords was persisted at: " + inputRecordsRDD.getStorageLevel()); } - WorkloadProfile profile = null; + WorkloadProfile workloadProfile = null; if (isWorkloadProfileNeeded()) { context.setJobStatus(this.getClass().getSimpleName(), "Building workload profile"); - profile = new WorkloadProfile(buildProfile(inputRecordsRDD), operationType); - LOG.info("Workload profile :" + profile); - saveWorkloadProfileMetadataToInflight(profile, instantTime); + workloadProfile = new WorkloadProfile(buildProfile(inputRecordsRDD), operationType, table.getIndex().canIndexLogFiles()); + LOG.info("Input workload profile :" + workloadProfile); + } + + // partition using the insert partitioner + final Partitioner partitioner = getPartitioner(workloadProfile); + if (isWorkloadProfileNeeded()) { + saveWorkloadProfileMetadataToInflight(workloadProfile, instantTime); } // handle records update with clustering JavaRDD> inputRecordsRDDWithClusteringUpdate = clusteringHandleUpdate(inputRecordsRDD); - // partition using the insert partitioner - final Partitioner partitioner = getPartitioner(profile); context.setJobStatus(this.getClass().getSimpleName(), "Doing partition and writing data"); JavaRDD> partitionedRecords = partition(inputRecordsRDDWithClusteringUpdate, partitioner); JavaRDD writeStatusRDD = partitionedRecords.mapPartitionsWithIndex((partition, recordItr) -> { @@ -273,6 +270,13 @@ protected String getCommitActionType() { return table.getMetaClient().getCommitActionType(); } + @Override + protected void setCommitMetadata(HoodieWriteMetadata> result) { + result.setCommitMetadata(Option.of(CommitUtils.buildMetadata(result.getWriteStatuses().map(WriteStatus::getStat).collect(), + result.getPartitionToReplaceFileIds(), + extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType()))); + } + @Override protected void commit(Option> extraMetadata, HoodieWriteMetadata> result) { context.setJobStatus(this.getClass().getSimpleName(), "Commit write status collect"); @@ -288,8 +292,7 @@ protected void commit(Option> extraMetadata, HoodieWriteMeta finalizeWrite(instantTime, writeStats, result); try { HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); - HoodieCommitMetadata metadata = CommitUtils.buildMetadata(writeStats, result.getPartitionToReplaceFileIds(), - extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType()); + HoodieCommitMetadata metadata = result.getCommitMetadata().get(); writeTableMetadata(metadata, actionType); 
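With the new setCommitMetadata override, commit() above consumes a HoodieCommitMetadata that was already built once from the write statuses instead of rebuilding it inline, so pre-commit validators and the final commit see the same object. The flow implied by the two hunks, condensed (every name here appears in this diff):

// Built once, up front, by the setCommitMetadata override:
result.setCommitMetadata(Option.of(CommitUtils.buildMetadata(
    result.getWriteStatuses().map(WriteStatus::getStat).collect(),
    result.getPartitionToReplaceFileIds(),
    extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType())));
// ...then simply read back at commit time:
HoodieCommitMetadata metadata = result.getCommitMetadata().get();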
activeTimeline.saveAsComplete(new HoodieInstant(true, getCommitActionType(), instantTime), Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBucketIndexPartitioner.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBucketIndexPartitioner.java index 71da2244db56f..65a45e1c6a047 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBucketIndexPartitioner.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBucketIndexPartitioner.java @@ -74,7 +74,7 @@ public SparkBucketIndexPartitioner(WorkloadProfile profile, " Bucket index partitioner should only be used by BucketIndex other than " + table.getIndex().getClass().getSimpleName()); } - this.numBuckets = ((HoodieBucketIndex) table.getIndex()).getNumBuckets(); + this.numBuckets = ((HoodieBucketIndex) table.getIndex()).getNumBuckets(); this.indexKeyField = config.getBucketIndexHashField(); this.totalPartitionPaths = profile.getPartitionPaths().size(); partitionPaths = new ArrayList<>(profile.getPartitionPaths()); @@ -90,7 +90,7 @@ public SparkBucketIndexPartitioner(WorkloadProfile profile, private void assignUpdates(WorkloadProfile profile) { updatePartitionPathFileIds = new HashMap<>(); // each update location gets a partition - Set> partitionStatEntries = profile.getPartitionPathStatMap() + Set> partitionStatEntries = profile.getInputPartitionPathStatMap() .entrySet(); for (Entry partitionStat : partitionStatEntries) { if (!updatePartitionPathFileIds.containsKey(partitionStat.getKey())) { diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertCommitActionExecutor.java index 2b00d47b01564..f4f1d3ad06ccf 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertCommitActionExecutor.java @@ -29,6 +29,7 @@ import org.apache.hudi.table.BulkInsertPartitioner; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; + import org.apache.spark.api.java.JavaRDD; import java.util.Map; @@ -36,17 +37,17 @@ public class SparkBulkInsertCommitActionExecutor> extends BaseSparkCommitActionExecutor { private final JavaRDD> inputRecordsRDD; - private final Option> bulkInsertPartitioner; + private final Option>>> bulkInsertPartitioner; public SparkBulkInsertCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, String instantTime, JavaRDD> inputRecordsRDD, - Option> bulkInsertPartitioner) { + Option>>> bulkInsertPartitioner) { this(context, config, table, instantTime, inputRecordsRDD, bulkInsertPartitioner, Option.empty()); } public SparkBulkInsertCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, String instantTime, JavaRDD> inputRecordsRDD, - Option> bulkInsertPartitioner, + Option>>> bulkInsertPartitioner, Option> extraMetadata) { super(context, config, table, instantTime, WriteOperationType.BULK_INSERT, extraMetadata); this.inputRecordsRDD = inputRecordsRDD; diff --git 
a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java index 4644d29e00fa1..d0c5ddef5e71d 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java @@ -41,12 +41,12 @@ import java.util.stream.IntStream; /** - * A spark implementation of {@link AbstractBulkInsertHelper}. + * A spark implementation of {@link BaseBulkInsertHelper}. * * @param */ @SuppressWarnings("checkstyle:LineLength") -public class SparkBulkInsertHelper extends AbstractBulkInsertHelper>, +public class SparkBulkInsertHelper extends BaseBulkInsertHelper>, JavaRDD, JavaRDD, R> { private SparkBulkInsertHelper() { @@ -67,7 +67,7 @@ public HoodieWriteMetadata> bulkInsert(final JavaRDD>, JavaRDD, JavaRDD, R> executor, final boolean performDedupe, - final Option> userDefinedBulkInsertPartitioner) { + final Option>>> userDefinedBulkInsertPartitioner) { HoodieWriteMetadata result = new HoodieWriteMetadata(); //transition bulk_insert state to inflight @@ -88,7 +88,7 @@ public JavaRDD bulkInsert(JavaRDD> inputRecords, HoodieTable>, JavaRDD, JavaRDD> table, HoodieWriteConfig config, boolean performDedupe, - Option> userDefinedBulkInsertPartitioner, + Option>>> userDefinedBulkInsertPartitioner, boolean useWriterSchema, int parallelism, WriteHandleFactory writeHandleFactory) { diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertPreppedCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertPreppedCommitActionExecutor.java index e6b6809498e29..28d8cb0b26422 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertPreppedCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertPreppedCommitActionExecutor.java @@ -26,22 +26,22 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieInsertException; -import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.BulkInsertPartitioner; - +import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; + import org.apache.spark.api.java.JavaRDD; public class SparkBulkInsertPreppedCommitActionExecutor> extends BaseSparkCommitActionExecutor { private final JavaRDD> preppedInputRecordRdd; - private final Option> userDefinedBulkInsertPartitioner; + private final Option>>> userDefinedBulkInsertPartitioner; public SparkBulkInsertPreppedCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, String instantTime, JavaRDD> preppedInputRecordRdd, - Option> userDefinedBulkInsertPartitioner) { + Option>>> userDefinedBulkInsertPartitioner) { super(context, config, table, instantTime, WriteOperationType.BULK_INSERT); this.preppedInputRecordRdd = preppedInputRecordRdd; this.userDefinedBulkInsertPartitioner = userDefinedBulkInsertPartitioner; @@ -60,4 +60,4 @@ public HoodieWriteMetadata> execute() { } } -} \ No newline at end of file +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkDeleteHelper.java 
b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkDeleteHelper.java index 5c3b4ca22f845..381c115533897 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkDeleteHelper.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkDeleteHelper.java @@ -22,6 +22,7 @@ import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.EmptyHoodieRecordPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; @@ -42,13 +43,13 @@ import java.util.HashMap; /** - * A spark implementation of {@link AbstractDeleteHelper}. + * A spark implementation of {@link BaseDeleteHelper}. * * @param */ @SuppressWarnings("checkstyle:LineLength") public class SparkDeleteHelper extends - AbstractDeleteHelper>, JavaRDD, JavaRDD, R> { + BaseDeleteHelper>, JavaRDD, JavaRDD, R> { private SparkDeleteHelper() { } @@ -93,7 +94,7 @@ public HoodieWriteMetadata> execute(String instantTime, } JavaRDD> dedupedRecords = - dedupedKeys.map(key -> new HoodieRecord(key, new EmptyHoodieRecordPayload())); + dedupedKeys.map(key -> new HoodieAvroRecord(key, new EmptyHoodieRecordPayload())); Instant beginTag = Instant.now(); // perform index loop up to get existing location of records JavaRDD> taggedRecords = HoodieJavaRDD.getJavaRDD( diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertOverwritePartitioner.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertOverwritePartitioner.java index 75dfbda30b7fb..dd545d5262846 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertOverwritePartitioner.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkInsertOverwritePartitioner.java @@ -22,6 +22,7 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.WorkloadProfile; + import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -44,7 +45,7 @@ public SparkInsertOverwritePartitioner(WorkloadProfile profile, HoodieEngineCont * Returns a list of small files in the given partition path. */ protected List getSmallFiles(String partitionPath) { - // for overwrite, we ignore all existing files. So dont consider any file to be smallFiles + // for overwrite, we ignore all existing files. 
So do not consider any file to be smallFiles return Collections.emptyList(); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkMergeHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkMergeHelper.java index 5e82dbd8c566d..e87c3ef5ba77e 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkMergeHelper.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkMergeHelper.java @@ -25,7 +25,6 @@ import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor; import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.execution.SparkBoundedInMemoryExecutor; import org.apache.hudi.io.HoodieMergeHandle; import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.hudi.io.storage.HoodieFileReaderFactory; @@ -43,7 +42,7 @@ import java.io.IOException; import java.util.Iterator; -public class SparkMergeHelper extends AbstractMergeHelper>, +public class SparkMergeHelper extends BaseMergeHelper>, JavaRDD, JavaRDD> { private SparkMergeHelper() { @@ -90,13 +89,13 @@ public void runMerge(HoodieTable>, JavaRDD ThreadLocal encoderCache = new ThreadLocal<>(); ThreadLocal decoderCache = new ThreadLocal<>(); - wrapper = new SparkBoundedInMemoryExecutor(table.getConfig(), readerIterator, + wrapper = new BoundedInMemoryExecutor(table.getConfig().getWriteBufferLimitBytes(), readerIterator, new UpdateHandler(mergeHandle), record -> { if (!externalSchemaTransformation) { return record; } return transformRecordBasedOnNewSchema(gReader, gWriter, encoderCache, decoderCache, (GenericRecord) record); - }); + }, table.getPreExecuteRunnable()); wrapper.execute(); } catch (Exception e) { throw new HoodieException(e); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java index f4eff44a26f3a..23dceb1382f34 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java @@ -20,6 +20,7 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; @@ -32,11 +33,11 @@ import scala.Tuple2; /** - * A spark implementation of {@link AbstractWriteHelper}. + * A spark implementation of {@link BaseWriteHelper}. * * @param */ -public class SparkWriteHelper extends AbstractWriteHelper>, +public class SparkWriteHelper extends BaseWriteHelper>, JavaRDD, JavaRDD, R> { private SparkWriteHelper() { } @@ -58,7 +59,7 @@ protected JavaRDD> tag(JavaRDD> dedupedRecords, @Override public JavaRDD> deduplicateRecords( - JavaRDD> records, HoodieIndex index, int parallelism) { + JavaRDD> records, HoodieIndex index, int parallelism) { boolean isIndexingGlobal = index.isGlobal(); return records.mapToPair(record -> { HoodieKey hoodieKey = record.getKey(); @@ -70,7 +71,7 @@ public JavaRDD> deduplicateRecords( T reducedData = (T) rec2.getData().preCombine(rec1.getData()); HoodieKey reducedKey = rec1.getData().equals(reducedData) ? 
rec1.getKey() : rec2.getKey(); - return new HoodieRecord(reducedKey, reducedData); + return new HoodieAvroRecord(reducedKey, reducedData); }, parallelism).map(Tuple2::_2); } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/UpsertPartitioner.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/UpsertPartitioner.java index 6729da72d65eb..c54c526253f0b 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/UpsertPartitioner.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/UpsertPartitioner.java @@ -26,6 +26,7 @@ import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.NumericUtils; @@ -100,11 +101,19 @@ public UpsertPartitioner(WorkloadProfile profile, HoodieEngineContext context, H private void assignUpdates(WorkloadProfile profile) { // each update location gets a partition - Set> partitionStatEntries = profile.getPartitionPathStatMap().entrySet(); + Set> partitionStatEntries = profile.getInputPartitionPathStatMap().entrySet(); for (Map.Entry partitionStat : partitionStatEntries) { + WorkloadStat outputWorkloadStats = profile.getOutputPartitionPathStatMap().getOrDefault(partitionStat.getKey(), new WorkloadStat()); for (Map.Entry> updateLocEntry : partitionStat.getValue().getUpdateLocationToCount().entrySet()) { addUpdateBucket(partitionStat.getKey(), updateLocEntry.getKey()); + if (profile.hasOutputWorkLoadStats()) { + HoodieRecordLocation hoodieRecordLocation = new HoodieRecordLocation(updateLocEntry.getValue().getKey(), updateLocEntry.getKey()); + outputWorkloadStats.addUpdates(hoodieRecordLocation, updateLocEntry.getValue().getValue()); + } + } + if (profile.hasOutputWorkLoadStats()) { + profile.updateOutputPartitionPathStatMap(partitionStat.getKey(), outputWorkloadStats); } } } @@ -161,11 +170,12 @@ private void assignInserts(WorkloadProfile profile, HoodieEngineContext context) for (String partitionPath : partitionPaths) { WorkloadStat pStat = profile.getWorkloadStat(partitionPath); + WorkloadStat outputWorkloadStats = profile.getOutputPartitionPathStatMap().getOrDefault(partitionPath, new WorkloadStat()); if (pStat.getNumInserts() > 0) { List smallFiles = filterSmallFilesInClustering(partitionPathToPendingClusteringFileGroupsId.getOrDefault(partitionPath, Collections.emptySet()), - partitionSmallFilesMap.get(partitionPath)); + partitionSmallFilesMap.getOrDefault(partitionPath, new ArrayList<>())); this.smallFiles.addAll(smallFiles); @@ -189,6 +199,9 @@ private void assignInserts(WorkloadProfile profile, HoodieEngineContext context) bucket = addUpdateBucket(partitionPath, smallFile.location.getFileId()); LOG.info("Assigning " + recordsToAppend + " inserts to new update bucket " + bucket); } + if (profile.hasOutputWorkLoadStats()) { + outputWorkloadStats.addInserts(smallFile.location, recordsToAppend); + } bucketNumbers.add(bucket); recordsPerBucket.add(recordsToAppend); totalUnassignedInserts -= recordsToAppend; @@ -218,6 +231,9 @@ private void assignInserts(WorkloadProfile profile, HoodieEngineContext context) } BucketInfo bucketInfo = new BucketInfo(BucketType.INSERT, FSUtils.createNewFileIdPfx(), partitionPath); 
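The outputWorkloadStats bookkeeping introduced above records where each insert will physically land. Two cases fall out of the hunk: inserts bin-packed into an existing small file reuse that file's real location, while inserts routed to a brand-new file group have no commit yet, so their location pairs the NULL_COMMIT sentinel with the freshly created fileId prefix. Condensed from the diff (the two sites are separate in the real code; guard and accessors as shown there):

if (profile.hasOutputWorkLoadStats()) {
  // Case 1: inserts appended to an existing small file keep its real location.
  outputWorkloadStats.addInserts(smallFile.location, recordsToAppend);
  // Case 2: inserts routed to a new file group; no commit exists yet, so the
  // NULL_COMMIT sentinel is paired with the new fileId prefix.
  outputWorkloadStats.addInserts(
      new HoodieRecordLocation(HoodieWriteStat.NULL_COMMIT, bucketInfo.getFileIdPrefix()),
      recordsPerBucket.get(recordsPerBucket.size() - 1));
}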
bucketInfoMap.put(totalBuckets, bucketInfo); + if (profile.hasOutputWorkLoadStats()) { + outputWorkloadStats.addInserts(new HoodieRecordLocation(HoodieWriteStat.NULL_COMMIT, bucketInfo.getFileIdPrefix()), recordsPerBucket.get(recordsPerBucket.size() - 1)); + } totalBuckets++; } } @@ -235,12 +251,20 @@ private void assignInserts(WorkloadProfile profile, HoodieEngineContext context) LOG.info("Total insert buckets for partition path " + partitionPath + " => " + insertBuckets); partitionPathToInsertBucketInfos.put(partitionPath, insertBuckets); } + if (profile.hasOutputWorkLoadStats()) { + profile.updateOutputPartitionPathStatMap(partitionPath, outputWorkloadStats); + } } } private Map> getSmallFilesForPartitions(List partitionPaths, HoodieEngineContext context) { JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context); Map> partitionSmallFilesMap = new HashMap<>(); + + if (config.getParquetSmallFileLimit() <= 0) { + return partitionSmallFilesMap; + } + if (partitionPaths != null && partitionPaths.size() > 0) { context.setJobStatus(this.getClass().getSimpleName(), "Getting small files from partitions"); JavaRDD partitionPathRdds = jsc.parallelize(partitionPaths, partitionPaths.size()); @@ -297,6 +321,11 @@ public int numPartitions() { return totalBuckets; } + @Override + public int getNumPartitions() { + return totalBuckets; + } + @Override public int getPartition(Object key) { Tuple2> keyLocation = diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/AbstractSparkDeltaCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/BaseSparkDeltaCommitActionExecutor.java similarity index 90% rename from hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/AbstractSparkDeltaCommitActionExecutor.java rename to hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/BaseSparkDeltaCommitActionExecutor.java index 3b3edd3084572..222506e7bbb36 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/AbstractSparkDeltaCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/BaseSparkDeltaCommitActionExecutor.java @@ -43,19 +43,19 @@ import java.util.Iterator; import java.util.List; -public abstract class AbstractSparkDeltaCommitActionExecutor> +public abstract class BaseSparkDeltaCommitActionExecutor> extends BaseSparkCommitActionExecutor { - private static final Logger LOG = LogManager.getLogger(AbstractSparkDeltaCommitActionExecutor.class); + private static final Logger LOG = LogManager.getLogger(BaseSparkDeltaCommitActionExecutor.class); // UpsertPartitioner for MergeOnRead table type private SparkUpsertDeltaCommitPartitioner mergeOnReadUpsertPartitioner; - public AbstractSparkDeltaCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, + public BaseSparkDeltaCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, String instantTime, WriteOperationType operationType) { this(context, config, table, instantTime, operationType, Option.empty()); } - public AbstractSparkDeltaCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, + public BaseSparkDeltaCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, String instantTime, WriteOperationType operationType, 
Option> extraMetadata) { super(context, config, table, instantTime, operationType, extraMetadata); diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkBulkInsertDeltaCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkBulkInsertDeltaCommitActionExecutor.java index 281304d957620..6f23e41773bbd 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkBulkInsertDeltaCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkBulkInsertDeltaCommitActionExecutor.java @@ -18,8 +18,6 @@ package org.apache.hudi.table.action.deltacommit; -import java.util.Map; - import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.common.model.HoodieRecord; @@ -28,28 +26,30 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieInsertException; -import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.BulkInsertPartitioner; - +import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.hudi.table.action.commit.SparkBulkInsertHelper; + import org.apache.spark.api.java.JavaRDD; +import java.util.Map; + public class SparkBulkInsertDeltaCommitActionExecutor> - extends AbstractSparkDeltaCommitActionExecutor { + extends BaseSparkDeltaCommitActionExecutor { private final JavaRDD> inputRecordsRDD; - private final Option> bulkInsertPartitioner; + private final Option>>> bulkInsertPartitioner; public SparkBulkInsertDeltaCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, String instantTime, JavaRDD> inputRecordsRDD, - Option> bulkInsertPartitioner) { + Option>>> bulkInsertPartitioner) { this(context, config, table, instantTime, inputRecordsRDD, bulkInsertPartitioner, Option.empty()); } public SparkBulkInsertDeltaCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, String instantTime, JavaRDD> inputRecordsRDD, - Option> bulkInsertPartitioner, + Option>>> bulkInsertPartitioner, Option> extraMetadata) { super(context, config, table, instantTime, WriteOperationType.BULK_INSERT, extraMetadata); this.inputRecordsRDD = inputRecordsRDD; diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkBulkInsertPreppedDeltaCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkBulkInsertPreppedDeltaCommitActionExecutor.java index 21fc013af69c9..be5b903c7642d 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkBulkInsertPreppedDeltaCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkBulkInsertPreppedDeltaCommitActionExecutor.java @@ -26,23 +26,23 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieInsertException; -import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.BulkInsertPartitioner; - +import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.hudi.table.action.commit.SparkBulkInsertHelper; + import 
org.apache.spark.api.java.JavaRDD; public class SparkBulkInsertPreppedDeltaCommitActionExecutor> - extends AbstractSparkDeltaCommitActionExecutor { + extends BaseSparkDeltaCommitActionExecutor { private final JavaRDD> preppedInputRecordRdd; - private final Option> bulkInsertPartitioner; + private final Option>>> bulkInsertPartitioner; public SparkBulkInsertPreppedDeltaCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, String instantTime, JavaRDD> preppedInputRecordRdd, - Option> bulkInsertPartitioner) { + Option>>> bulkInsertPartitioner) { super(context, config, table, instantTime, WriteOperationType.BULK_INSERT); this.preppedInputRecordRdd = preppedInputRecordRdd; this.bulkInsertPartitioner = bulkInsertPartitioner; @@ -61,4 +61,4 @@ public HoodieWriteMetadata> execute() { } } -} \ No newline at end of file +} diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkDeleteDeltaCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkDeleteDeltaCommitActionExecutor.java index 4fb6a90f90a41..7cff563571459 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkDeleteDeltaCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkDeleteDeltaCommitActionExecutor.java @@ -25,13 +25,13 @@ import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; - import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.hudi.table.action.commit.SparkDeleteHelper; + import org.apache.spark.api.java.JavaRDD; public class SparkDeleteDeltaCommitActionExecutor> - extends AbstractSparkDeltaCommitActionExecutor { + extends BaseSparkDeltaCommitActionExecutor { private final JavaRDD keys; diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkInsertDeltaCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkInsertDeltaCommitActionExecutor.java index 7dd91710d66e9..7e38823fc8838 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkInsertDeltaCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkInsertDeltaCommitActionExecutor.java @@ -25,13 +25,13 @@ import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; - import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.hudi.table.action.commit.SparkWriteHelper; + import org.apache.spark.api.java.JavaRDD; public class SparkInsertDeltaCommitActionExecutor> - extends AbstractSparkDeltaCommitActionExecutor { + extends BaseSparkDeltaCommitActionExecutor { private final JavaRDD> inputRecordsRDD; diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkInsertPreppedDeltaCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkInsertPreppedDeltaCommitActionExecutor.java index 1f1e0165b494a..e401d9555e434 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkInsertPreppedDeltaCommitActionExecutor.java +++ 
b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkInsertPreppedDeltaCommitActionExecutor.java @@ -26,10 +26,11 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; + import org.apache.spark.api.java.JavaRDD; public class SparkInsertPreppedDeltaCommitActionExecutor> - extends AbstractSparkDeltaCommitActionExecutor { + extends BaseSparkDeltaCommitActionExecutor { private final JavaRDD> preppedRecords; diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitActionExecutor.java index c6f3901a352b8..c63be6289004d 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitActionExecutor.java @@ -24,13 +24,13 @@ import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; - import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.hudi.table.action.commit.SparkWriteHelper; + import org.apache.spark.api.java.JavaRDD; public class SparkUpsertDeltaCommitActionExecutor> - extends AbstractSparkDeltaCommitActionExecutor { + extends BaseSparkDeltaCommitActionExecutor { private JavaRDD> inputRecordsRDD; diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitPartitioner.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitPartitioner.java index 8dd3146f5161d..e498019c415d8 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitPartitioner.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitPartitioner.java @@ -97,6 +97,10 @@ private List getSmallFileCandidates(String partitionPath, HoodieInsta .collect(Collectors.toList()); } + if (config.getParquetSmallFileLimit() <= 0) { + return Collections.emptyList(); + } + // If we cannot index log files, then we choose the smallest parquet file in the partition and add inserts to // it. 
Doing this over time for a partition, we ensure that we handle small file issues return table.getSliceView() diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertPreppedDeltaCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertPreppedDeltaCommitActionExecutor.java index 3509efa6bfa9f..f593fea779029 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertPreppedDeltaCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertPreppedDeltaCommitActionExecutor.java @@ -26,10 +26,11 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; + import org.apache.spark.api.java.JavaRDD; public class SparkUpsertPreppedDeltaCommitActionExecutor> - extends AbstractSparkDeltaCommitActionExecutor { + extends BaseSparkDeltaCommitActionExecutor { private final JavaRDD> preppedRecords; diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/SparkUpgradeDowngradeHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/SparkUpgradeDowngradeHelper.java index f943b701757ed..1a911d5b42bba 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/SparkUpgradeDowngradeHelper.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/upgrade/SparkUpgradeDowngradeHelper.java @@ -28,7 +28,7 @@ /** * Spark upgrade and downgrade helper. */ -public class SparkUpgradeDowngradeHelper implements BaseUpgradeDowngradeHelper { +public class SparkUpgradeDowngradeHelper implements SupportsUpgradeDowngrade { private static final SparkUpgradeDowngradeHelper SINGLETON_INSTANCE = new SparkUpgradeDowngradeHelper(); diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionHelper.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionHelper.scala deleted file mode 100644 index f968cbe1c77bd..0000000000000 --- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionHelper.scala +++ /dev/null @@ -1,380 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
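As a rough illustration of the guard added to SparkUpsertDeltaCommitPartitioner above: once the Parquet small-file limit is non-positive, getSmallFileCandidates returns an empty list and inserts are routed to new file groups instead of existing small files. A minimal sketch, assuming the standard hoodie.parquet.small.file.limit key (which backs config.getParquetSmallFileLimit()) and an illustrative base path:

    import java.util.Properties
    import org.apache.hudi.config.HoodieWriteConfig

    val props = new Properties()
    // Assumed config key; setting it to 0 disables small-file candidate
    // selection in the delta-commit partitioner per the guard above.
    props.setProperty("hoodie.parquet.small.file.limit", "0")

    val writeConfig = HoodieWriteConfig.newBuilder()
      .withPath("/tmp/hudi_table") // hypothetical base path
      .withProperties(props)
      .build()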
- */ - -package org.apache.hudi - -import java.nio.ByteBuffer -import java.sql.{Date, Timestamp} -import java.time.Instant - -import org.apache.avro.Conversions.DecimalConversion -import org.apache.avro.LogicalTypes.{TimestampMicros, TimestampMillis} -import org.apache.avro.Schema.Type._ -import org.apache.avro.generic.GenericData.{Fixed, Record} -import org.apache.avro.generic.{GenericData, GenericFixed, GenericRecord} -import org.apache.avro.{LogicalTypes, Schema} - -import org.apache.spark.sql.Row -import org.apache.spark.sql.avro.SchemaConverters -import org.apache.spark.sql.catalyst.expressions.GenericRow -import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.types._ - -import org.apache.hudi.AvroConversionUtils._ -import org.apache.hudi.exception.HoodieIncompatibleSchemaException - -import scala.collection.JavaConverters._ - -object AvroConversionHelper { - - private def createDecimal(decimal: java.math.BigDecimal, precision: Int, scale: Int): Decimal = { - if (precision <= Decimal.MAX_LONG_DIGITS) { - // Constructs a `Decimal` with an unscaled `Long` value if possible. - Decimal(decimal.unscaledValue().longValue(), precision, scale) - } else { - // Otherwise, resorts to an unscaled `BigInteger` instead. - Decimal(decimal, precision, scale) - } - } - - /** - * - * Returns a converter function to convert row in avro format to GenericRow of catalyst. - * - * @param sourceAvroSchema Source schema before conversion inferred from avro file by passed in - * by user. - * @param targetSqlType Target catalyst sql type after the conversion. - * @return returns a converter function to convert row in avro format to GenericRow of catalyst. - */ - def createConverterToRow(sourceAvroSchema: Schema, - targetSqlType: DataType): AnyRef => AnyRef = { - - def createConverter(avroSchema: Schema, sqlType: DataType, path: List[String]): AnyRef => AnyRef = { - val avroType = avroSchema.getType - (sqlType, avroType) match { - // Avro strings are in Utf8, so we have to call toString on them - case (StringType, STRING) | (StringType, ENUM) => - (item: AnyRef) => if (item == null) null else item.toString - // Byte arrays are reused by avro, so we have to make a copy of them. 
- case (IntegerType, INT) | (BooleanType, BOOLEAN) | (DoubleType, DOUBLE) | - (FloatType, FLOAT) | (LongType, LONG) => - identity - case (BinaryType, FIXED) => - (item: AnyRef) => - if (item == null) { - null - } else { - item.asInstanceOf[Fixed].bytes().clone() - } - case (BinaryType, BYTES) => - (item: AnyRef) => - if (item == null) { - null - } else { - val byteBuffer = item.asInstanceOf[ByteBuffer] - val bytes = new Array[Byte](byteBuffer.remaining) - byteBuffer.get(bytes) - bytes - } - case (d: DecimalType, FIXED) => - (item: AnyRef) => - if (item == null) { - null - } else { - val decimalConversion = new DecimalConversion - val bigDecimal = decimalConversion.fromFixed(item.asInstanceOf[GenericFixed], avroSchema, - LogicalTypes.decimal(d.precision, d.scale)) - createDecimal(bigDecimal, d.precision, d.scale) - } - case (d: DecimalType, BYTES) => - (item: AnyRef) => - if (item == null) { - null - } else { - val decimalConversion = new DecimalConversion - val bigDecimal = decimalConversion.fromBytes(item.asInstanceOf[ByteBuffer], avroSchema, - LogicalTypes.decimal(d.precision, d.scale)) - createDecimal(bigDecimal, d.precision, d.scale) - } - case (DateType, INT) => - (item: AnyRef) => - if (item == null) { - null - } else { - item match { - case integer: Integer => DateTimeUtils.toJavaDate(integer) - case _ => new Date(item.asInstanceOf[Long]) - } - } - case (TimestampType, LONG) => - (item: AnyRef) => - if (item == null) { - null - } else { - avroSchema.getLogicalType match { - case _: TimestampMillis => - new Timestamp(item.asInstanceOf[Long]) - case _: TimestampMicros => - new Timestamp(item.asInstanceOf[Long] / 1000) - case null => - new Timestamp(item.asInstanceOf[Long]) - case other => - throw new HoodieIncompatibleSchemaException( - s"Cannot convert Avro logical type $other to Catalyst Timestamp type.") - } - } - case (struct: StructType, RECORD) => - val length = struct.fields.length - val converters = new Array[AnyRef => AnyRef](length) - val avroFieldIndexes = new Array[Int](length) - var i = 0 - while (i < length) { - val sqlField = struct.fields(i) - val avroField = avroSchema.getField(sqlField.name) - if (avroField != null) { - val converter = createConverter(avroField.schema(), sqlField.dataType, - path :+ sqlField.name) - converters(i) = converter - avroFieldIndexes(i) = avroField.pos() - } else if (!sqlField.nullable) { - throw new HoodieIncompatibleSchemaException( - s"Cannot find non-nullable field ${sqlField.name} at path ${path.mkString(".")} " + - "in Avro schema\n" + - s"Source Avro schema: $sourceAvroSchema.\n" + - s"Target Catalyst type: $targetSqlType") - } - i += 1 - } - - (item: AnyRef) => { - if (item == null) { - null - } else { - val record = item.asInstanceOf[GenericRecord] - - val result = new Array[Any](length) - var i = 0 - while (i < converters.length) { - if (converters(i) != null) { - val converter = converters(i) - result(i) = converter(record.get(avroFieldIndexes(i))) - } - i += 1 - } - new GenericRow(result) - } - } - case (arrayType: ArrayType, ARRAY) => - val elementConverter = createConverter(avroSchema.getElementType, arrayType.elementType, - path) - val allowsNull = arrayType.containsNull - (item: AnyRef) => { - if (item == null) { - null - } else { - item.asInstanceOf[java.lang.Iterable[AnyRef]].asScala.map { element => - if (element == null && !allowsNull) { - throw new RuntimeException(s"Array value at path ${path.mkString(".")} is not " + - "allowed to be null") - } else { - elementConverter(element) - } - } - } - } - case (mapType: 
MapType, MAP) if mapType.keyType == StringType => - val valueConverter = createConverter(avroSchema.getValueType, mapType.valueType, path) - val allowsNull = mapType.valueContainsNull - (item: AnyRef) => { - if (item == null) { - null - } else { - item.asInstanceOf[java.util.Map[AnyRef, AnyRef]].asScala.map { x => - if (x._2 == null && !allowsNull) { - throw new RuntimeException(s"Map value at path ${path.mkString(".")} is not " + - "allowed to be null") - } else { - (x._1.toString, valueConverter(x._2)) - } - }.toMap - } - } - case (sqlType, UNION) => - if (avroSchema.getTypes.asScala.exists(_.getType == NULL)) { - val remainingUnionTypes = avroSchema.getTypes.asScala.filterNot(_.getType == NULL) - if (remainingUnionTypes.size == 1) { - createConverter(remainingUnionTypes.head, sqlType, path) - } else { - createConverter(Schema.createUnion(remainingUnionTypes.asJava), sqlType, path) - } - } else avroSchema.getTypes.asScala.map(_.getType) match { - case Seq(_) => createConverter(avroSchema.getTypes.get(0), sqlType, path) - case Seq(a, b) if Set(a, b) == Set(INT, LONG) && sqlType == LongType => - (item: AnyRef) => { - item match { - case null => null - case l: java.lang.Long => l - case i: java.lang.Integer => new java.lang.Long(i.longValue()) - } - } - case Seq(a, b) if Set(a, b) == Set(FLOAT, DOUBLE) && sqlType == DoubleType => - (item: AnyRef) => { - item match { - case null => null - case d: java.lang.Double => d - case f: java.lang.Float => new java.lang.Double(f.doubleValue()) - } - } - case other => - sqlType match { - case t: StructType if t.fields.length == avroSchema.getTypes.size => - val fieldConverters = t.fields.zip(avroSchema.getTypes.asScala).map { - case (field, schema) => - createConverter(schema, field.dataType, path :+ field.name) - } - - (item: AnyRef) => - if (item == null) { - null - } else { - val i = GenericData.get().resolveUnion(avroSchema, item) - val converted = new Array[Any](fieldConverters.length) - converted(i) = fieldConverters(i)(item) - new GenericRow(converted) - } - case _ => throw new HoodieIncompatibleSchemaException( - s"Cannot convert Avro schema to catalyst type because schema at path " + - s"${path.mkString(".")} is not compatible " + - s"(avroType = $other, sqlType = $sqlType). \n" + - s"Source Avro schema: $sourceAvroSchema.\n" + - s"Target Catalyst type: $targetSqlType") - } - } - case (left, right) => - throw new HoodieIncompatibleSchemaException( - s"Cannot convert Avro schema to catalyst type because schema at path " + - s"${path.mkString(".")} is not compatible (avroType = $left, sqlType = $right). 
\n" + - s"Source Avro schema: $sourceAvroSchema.\n" + - s"Target Catalyst type: $targetSqlType") - } - } - - createConverter(sourceAvroSchema, targetSqlType, List.empty[String]) - } - - def createConverterToAvro(dataType: DataType, - structName: String, - recordNamespace: String): Any => Any = { - dataType match { - case BinaryType => (item: Any) => - item match { - case null => null - case bytes: Array[Byte] => ByteBuffer.wrap(bytes) - } - case IntegerType | LongType | - FloatType | DoubleType | StringType | BooleanType => identity - case ByteType => (item: Any) => - if (item == null) null else item.asInstanceOf[Byte].intValue - case ShortType => (item: Any) => - if (item == null) null else item.asInstanceOf[Short].intValue - case dec: DecimalType => - val schema = SchemaConverters.toAvroType(dec, nullable = false, structName, recordNamespace) - (item: Any) => { - Option(item).map { _ => - val bigDecimalValue = item.asInstanceOf[java.math.BigDecimal] - val decimalConversions = new DecimalConversion() - decimalConversions.toFixed(bigDecimalValue, schema, LogicalTypes.decimal(dec.precision, dec.scale)) - }.orNull - } - case TimestampType => (item: Any) => - if (item == null) { - null - } else { - val timestamp = item match { - case i: Instant => Timestamp.from(i) - case t: Timestamp => t - } - // Convert time to microseconds since spark-avro by default converts TimestampType to - // Avro Logical TimestampMicros - timestamp.getTime * 1000 - } - case DateType => (item: Any) => - Option(item).map(_.asInstanceOf[Date].toLocalDate.toEpochDay.toInt).orNull - case ArrayType(elementType, _) => - val elementConverter = createConverterToAvro( - elementType, - structName, - recordNamespace) - (item: Any) => { - if (item == null) { - null - } else { - val sourceArray = item.asInstanceOf[Seq[Any]] - val sourceArraySize = sourceArray.size - val targetList = new java.util.ArrayList[Any](sourceArraySize) - var idx = 0 - while (idx < sourceArraySize) { - targetList.add(elementConverter(sourceArray(idx))) - idx += 1 - } - targetList - } - } - case MapType(StringType, valueType, _) => - val valueConverter = createConverterToAvro( - valueType, - structName, - recordNamespace) - (item: Any) => { - if (item == null) { - null - } else { - val javaMap = new java.util.HashMap[String, Any]() - item.asInstanceOf[Map[String, Any]].foreach { case (key, value) => - javaMap.put(key, valueConverter(value)) - } - javaMap - } - } - case structType: StructType => - val schema: Schema = convertStructTypeToAvroSchema(structType, structName, recordNamespace) - val childNameSpace = if (recordNamespace != "") s"$recordNamespace.$structName" else structName - val fieldConverters = structType.fields.map(field => - createConverterToAvro( - field.dataType, - field.name, - childNameSpace)) - (item: Any) => { - if (item == null) { - null - } else { - val record = new Record(schema) - val convertersIterator = fieldConverters.iterator - val fieldNamesIterator = dataType.asInstanceOf[StructType].fieldNames.iterator - val rowIterator = item.asInstanceOf[Row].toSeq.iterator - - while (convertersIterator.hasNext && rowIterator.hasNext) { - val converter = convertersIterator.next() - record.put(fieldNamesIterator.next(), converter(rowIterator.next())) - } - record - } - } - } - } -} diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionUtils.scala index 5b87fee14a1e2..62bcbf684b836 100644 --- 
a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionUtils.scala +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionUtils.scala @@ -17,21 +17,105 @@ */ package org.apache.hudi - -import org.apache.avro.Schema -import org.apache.avro.JsonProperties +import org.apache.avro.Schema.Type import org.apache.avro.generic.{GenericRecord, GenericRecordBuilder, IndexedRecord} +import org.apache.avro.{AvroRuntimeException, JsonProperties, Schema} +import org.apache.hudi.HoodieSparkUtils.sparkAdapter import org.apache.hudi.avro.HoodieAvroUtils import org.apache.spark.rdd.RDD import org.apache.spark.sql.avro.SchemaConverters -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.encoders.RowEncoder +import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.{Dataset, Row, SparkSession} -import scala.collection.JavaConverters._ import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ object AvroConversionUtils { + /** + * Check the nullability of the input Avro type and resolve it when it is nullable. The first + * return value is a [[Boolean]] indicating if the input Avro type is nullable. The second + * return value is either provided Avro type if it's not nullable, or its resolved non-nullable part + * in case it is + */ + def resolveAvroTypeNullability(avroType: Schema): (Boolean, Schema) = { + if (avroType.getType == Type.UNION) { + val fields = avroType.getTypes.asScala + val actualType = fields.filter(_.getType != Type.NULL) + if (fields.length != 2 || actualType.length != 1) { + throw new AvroRuntimeException( + s"Unsupported Avro UNION type $avroType: Only UNION of a null type and a non-null " + + "type is supported") + } + (true, actualType.head) + } else { + (false, avroType) + } + } + + /** + * Creates converter to transform Avro payload into Spark's Catalyst one + * + * @param rootAvroType Avro [[Schema]] to be transformed from + * @param rootCatalystType Catalyst [[StructType]] to be transformed into + * @return converter accepting Avro payload and transforming it into a Catalyst one (in the form of [[InternalRow]]) + */ + def createAvroToInternalRowConverter(rootAvroType: Schema, rootCatalystType: StructType): GenericRecord => Option[InternalRow] = + record => sparkAdapter.createAvroDeserializer(rootAvroType, rootCatalystType) + .deserialize(record) + .map(_.asInstanceOf[InternalRow]) + + /** + * Creates converter to transform Catalyst payload into Avro one + * + * @param rootCatalystType Catalyst [[StructType]] to be transformed from + * @param rootAvroType Avro [[Schema]] to be transformed into + * @param nullable whether Avro record is nullable + * @return converter accepting Catalyst payload (in the form of [[InternalRow]]) and transforming it into an Avro one + */ + def createInternalRowToAvroConverter(rootCatalystType: StructType, rootAvroType: Schema, nullable: Boolean): InternalRow => GenericRecord = { + row => sparkAdapter.createAvroSerializer(rootCatalystType, rootAvroType, nullable) + .serialize(row) + .asInstanceOf[GenericRecord] + } + + /** + * @deprecated please use [[AvroConversionUtils.createAvroToInternalRowConverter]] + */ + @Deprecated + def createConverterToRow(sourceAvroSchema: Schema, + targetSqlType: StructType): GenericRecord => Row = { + val encoder = RowEncoder.apply(targetSqlType).resolveAndBind() + val serde = sparkAdapter.createSparkRowSerDe(encoder) + val converter = 
AvroConversionUtils.createAvroToInternalRowConverter(sourceAvroSchema, targetSqlType) + + avro => converter.apply(avro).map(serde.deserializeRow).get + } + + /** + * @deprecated please use [[AvroConversionUtils.createInternalRowToAvroConverter]] + */ + @Deprecated + def createConverterToAvro(sourceSqlType: StructType, + structName: String, + recordNamespace: String): Row => GenericRecord = { + val encoder = RowEncoder.apply(sourceSqlType).resolveAndBind() + val serde = sparkAdapter.createSparkRowSerDe(encoder) + val avroSchema = AvroConversionUtils.convertStructTypeToAvroSchema(sourceSqlType, structName, recordNamespace) + val (nullable, _) = resolveAvroTypeNullability(avroSchema) + + val converter = AvroConversionUtils.createInternalRowToAvroConverter(sourceSqlType, avroSchema, nullable) + + row => converter.apply(serde.serializeRow(row)) + } + + /** + * Creates [[org.apache.spark.sql.DataFrame]] from the provided [[RDD]] of [[GenericRecord]]s + * + * TODO convert directly from GenericRecord into InternalRow instead + */ def createDataFrame(rdd: RDD[GenericRecord], schemaStr: String, ss: SparkSession): Dataset[Row] = { if (rdd.isEmpty()) { ss.emptyDataFrame @@ -41,8 +125,8 @@ object AvroConversionUtils { else { val schema = new Schema.Parser().parse(schemaStr) val dataType = convertAvroSchemaToStructType(schema) - val convertor = AvroConversionHelper.createConverterToRow(schema, dataType) - records.map { x => convertor(x).asInstanceOf[Row] } + val converter = createConverterToRow(schema, dataType) + records.map { r => converter(r) } } }, convertAvroSchemaToStructType(new Schema.Parser().parse(schemaStr))) } @@ -57,7 +141,7 @@ object AvroConversionUtils { * @param recordNamespace Avro record namespace. * @return Avro schema corresponding to given struct type. 
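A short usage sketch of the resolveAvroTypeNullability helper introduced above, with an inline union schema for illustration; only two-branch ["null", T] unions are accepted:

    import java.util.Arrays
    import org.apache.avro.Schema

    val nullableString = Schema.createUnion(Arrays.asList(
      Schema.create(Schema.Type.NULL), Schema.create(Schema.Type.STRING)))

    val (nullable, resolved) = AvroConversionUtils.resolveAvroTypeNullability(nullableString)
    // nullable == true, resolved.getType == Schema.Type.STRING; any union other
    // than ["null", T] makes the helper throw AvroRuntimeException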
*/ - def convertStructTypeToAvroSchema(structType: StructType, + def convertStructTypeToAvroSchema(structType: DataType, structName: String, recordNamespace: String): Schema = { getAvroSchemaWithDefaults(SchemaConverters.toAvroType(structType, nullable = false, structName, recordNamespace)) @@ -117,7 +201,7 @@ object AvroConversionUtils { def buildAvroRecordBySchema(record: IndexedRecord, requiredSchema: Schema, - requiredPos: List[Int], + requiredPos: Seq[Int], recordBuilder: GenericRecordBuilder): GenericRecord = { val requiredFields = requiredSchema.getFields.asScala assert(requiredFields.length == requiredPos.length) diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala index 3e5402565c151..b288289ac82ec 100644 --- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala @@ -18,13 +18,10 @@ package org.apache.hudi -import java.util.Properties - import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord - import org.apache.hadoop.fs.{FileSystem, Path} - +import org.apache.hudi.avro.HoodieAvroUtils.rewriteRecord import org.apache.hudi.client.utils.SparkRowSerDe import org.apache.hudi.common.config.TypedProperties import org.apache.hudi.common.model.HoodieRecord @@ -32,18 +29,17 @@ import org.apache.hudi.common.table.HoodieTableMetaClient import org.apache.hudi.keygen.constant.KeyGeneratorOptions import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory import org.apache.hudi.keygen.{BaseKeyGenerator, CustomAvroKeyGenerator, CustomKeyGenerator, KeyGenerator} - import org.apache.spark.SPARK_VERSION import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, Literal} import org.apache.spark.sql.execution.datasources.{FileStatusCache, InMemoryFileIndex} -import org.apache.spark.sql.functions._ import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{StringType, StructField, StructType} import org.apache.spark.sql.{DataFrame, SparkSession} -import scala.collection.JavaConverters.asScalaBufferConverter +import java.util.Properties +import scala.collection.JavaConverters._ object HoodieSparkUtils extends SparkAdapterSupport { @@ -53,8 +49,12 @@ object HoodieSparkUtils extends SparkAdapterSupport { def isSpark3_0: Boolean = SPARK_VERSION.startsWith("3.0") + def isSpark3_1: Boolean = SPARK_VERSION.startsWith("3.1") + def isSpark3_2: Boolean = SPARK_VERSION.startsWith("3.2") + def gteqSpark3_2: Boolean = SPARK_VERSION > "3.2" + def getMetaSchema: StructType = { StructType(HoodieRecord.HOODIE_META_COLUMNS.asScala.map(col => { StructField(col, StringType, nullable = true) @@ -123,46 +123,49 @@ object HoodieSparkUtils extends SparkAdapterSupport { new InMemoryFileIndex(sparkSession, globbedPaths, Map(), Option.empty, fileStatusCache) } - def createRdd(df: DataFrame, structName: String, recordNamespace: String, reconcileToLatestSchema: Boolean, latestTableSchema: - org.apache.hudi.common.util.Option[Schema] = org.apache.hudi.common.util.Option.empty()): RDD[GenericRecord] = { - val dfWriteSchema = AvroConversionUtils.convertStructTypeToAvroSchema(df.schema, structName, recordNamespace) - var writeSchema : Schema = null; - var toReconcileSchema : Schema = null; - if (reconcileToLatestSchema && 
latestTableSchema.isPresent) { - // if reconcileToLatestSchema is set to true and latestSchema is present, then try to leverage latestTableSchema. - // this code path will handle situations where records are serialized in old schema, but callers wish to convert - // to Rdd[GenericRecord] using different schema (could be evolved schema or could be latest table schema) - writeSchema = dfWriteSchema - toReconcileSchema = latestTableSchema.get() - } else { - // there are paths where callers wish to use latestTableSchema to convert to Rdd[GenericRecords] and not use - // row's schema. So use latestTableSchema if present; if not available, fall back to using row's schema. - writeSchema = if (latestTableSchema.isPresent) { latestTableSchema.get()} else { dfWriteSchema} - } - createRddInternal(df, writeSchema, toReconcileSchema, structName, recordNamespace) + /** + * @deprecated please use other overload [[createRdd]] + */ + def createRdd(df: DataFrame, structName: String, recordNamespace: String, reconcileToLatestSchema: Boolean, + latestTableSchema: org.apache.hudi.common.util.Option[Schema] = org.apache.hudi.common.util.Option.empty()): RDD[GenericRecord] = { + val latestTableSchemaConverted = if (latestTableSchema.isPresent && reconcileToLatestSchema) Some(latestTableSchema.get()) else None + createRdd(df, structName, recordNamespace, latestTableSchemaConverted) } - def createRddInternal(df: DataFrame, writeSchema: Schema, latestTableSchema: Schema, structName: String, recordNamespace: String) : RDD[GenericRecord] = { - // Use the write avro schema to derive the StructType which has the correct nullability information - val writeDataType = AvroConversionUtils.convertAvroSchemaToStructType(writeSchema) - val encoder = RowEncoder.apply(writeDataType).resolveAndBind() - val deserializer = sparkAdapter.createSparkRowSerDe(encoder) - // if records were serialized with old schema, but an evolved schema was passed in with latestTableSchema, we need - // latestTableSchema equivalent datatype to be passed in to AvroConversionHelper.createConverterToAvro() - val reconciledDataType = - if (latestTableSchema != null) AvroConversionUtils.convertAvroSchemaToStructType(latestTableSchema) else writeDataType - // Note: deserializer.deserializeRow(row) is not capable of handling evolved schema. i.e. if Row was serialized in - // old schema, but deserializer was created with an encoder with evolved schema, deserialization fails. - // Hence we always need to deserialize in the same schema as serialized schema.
- df.queryExecution.toRdd.map(row => deserializer.deserializeRow(row)) - .mapPartitions { records => - if (records.isEmpty) Iterator.empty - else { - val convertor = AvroConversionHelper.createConverterToAvro(reconciledDataType, structName, recordNamespace) - records.map { x => convertor(x).asInstanceOf[GenericRecord] } - } + def createRdd(df: DataFrame, structName: String, recordNamespace: String, readerAvroSchemaOpt: Option[Schema]): RDD[GenericRecord] = { + val writerSchema = df.schema + val writerAvroSchema = AvroConversionUtils.convertStructTypeToAvroSchema(writerSchema, structName, recordNamespace) + val readerAvroSchema = readerAvroSchemaOpt.getOrElse(writerAvroSchema) + // We check whether passed in reader schema is identical to writer schema to avoid costly serde loop of + // making Spark deserialize its internal representation [[InternalRow]] into [[Row]] for subsequent conversion + // (and back) + val sameSchema = writerAvroSchema.equals(readerAvroSchema) + val (nullable, _) = AvroConversionUtils.resolveAvroTypeNullability(writerAvroSchema) + + // NOTE: We have to serialize Avro schema, and then subsequently parse it on the executor node, since Spark + // serializer is not able to digest it + val readerAvroSchemaStr = readerAvroSchema.toString + val writerAvroSchemaStr = writerAvroSchema.toString + // NOTE: We're accessing toRdd here directly to avoid [[InternalRow]] to [[Row]] conversion + df.queryExecution.toRdd.mapPartitions { rows => + if (rows.isEmpty) { + Iterator.empty + } else { + val transform: GenericRecord => GenericRecord = + if (sameSchema) identity + else { + val readerAvroSchema = new Schema.Parser().parse(readerAvroSchemaStr) + rewriteRecord(_, readerAvroSchema) + } + + // Since caller might request to get records in a different ("evolved") schema, we will be rewriting from + // existing Writer's schema into Reader's (avro) schema + val writerAvroSchema = new Schema.Parser().parse(writerAvroSchemaStr) + val convert = AvroConversionUtils.createInternalRowToAvroConverter(writerSchema, writerAvroSchema, nullable = nullable) + + rows.map { ir => transform(convert(ir)) } } + } } def getDeserializer(structType: StructType) : SparkRowSerDe = { @@ -293,4 +296,30 @@ object HoodieSparkUtils extends SparkAdapterSupport { s"${tableSchema.fieldNames.mkString(",")}") AttributeReference(columnName, field.get.dataType, field.get.nullable)() } + + def getRequiredSchema(tableAvroSchema: Schema, requiredColumns: Array[String]): (Schema, StructType) = { + // First get the required avro-schema, then convert the avro-schema to spark schema. + val name2Fields = tableAvroSchema.getFields.asScala.map(f => f.name() -> f).toMap + // Here have to create a new Schema.Field object + // to prevent throwing exceptions like "org.apache.avro.AvroRuntimeException: Field already used". 
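To make the rewritten createRdd above concrete, a hedged usage sketch (df and latestTableAvroSchema are assumed to be in scope; the struct name and namespace are illustrative). When the supplied reader schema differs from the DataFrame's writer schema, each record is rewritten on the executors via HoodieAvroUtils.rewriteRecord:

    import org.apache.avro.generic.GenericRecord
    import org.apache.spark.rdd.RDD

    val records: RDD[GenericRecord] =
      HoodieSparkUtils.createRdd(df, "hoodie_record", "hoodie.source",
        readerAvroSchemaOpt = Some(latestTableAvroSchema)) // assumed evolved schema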
+ val requiredFields = requiredColumns.map(c => name2Fields(c)) + .map(f => new Schema.Field(f.name(), f.schema(), f.doc(), f.defaultVal(), f.order())).toList + val requiredAvroSchema = Schema.createRecord(tableAvroSchema.getName, tableAvroSchema.getDoc, + tableAvroSchema.getNamespace, tableAvroSchema.isError, requiredFields.asJava) + val requiredStructSchema = AvroConversionUtils.convertAvroSchemaToStructType(requiredAvroSchema) + (requiredAvroSchema, requiredStructSchema) + } + + def toAttribute(tableSchema: StructType): Seq[AttributeReference] = { + tableSchema.map { field => + AttributeReference(field.name, field.dataType, field.nullable, field.metadata)() + } + } + + def collectFieldIndexes(projectedSchema: StructType, originalSchema: StructType): Seq[Int] = { + val nameToIndex = originalSchema.fields.zipWithIndex.map { case (field, index) => + field.name -> index + }.toMap + projectedSchema.map(field => nameToIndex(field.name)) + } } diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/avro/HoodieAvroDeserializerTrait.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/avro/HoodieAvroDeserializerTrait.scala new file mode 100644 index 0000000000000..5c3035304cee7 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/avro/HoodieAvroDeserializerTrait.scala @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.avro + +/** + * Deserializes Avro payload into Catalyst object + * + * NOTE: This is a low-level component operating on Spark internal data-types (comprising [[InternalRow]]). + * If you're looking to convert Avro into "deserialized" [[Row]] (comprised of Java native types), + * please check [[AvroConversionUtils]] + */ +trait HoodieAvroDeserializerTrait { + final def deserialize(data: Any): Option[Any] = + doDeserialize(data) match { + case opt: Option[_] => opt // As of Spark 3.1, this will return data wrapped with Option, so we fetch the data + case row => Some(row) // For other Spark versions, return the data as is + } + + protected def doDeserialize(data: Any): Any +} diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/avro/HoodieAvroSerializerTrait.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/avro/HoodieAvroSerializerTrait.scala new file mode 100644 index 0000000000000..159d8da74d2db --- /dev/null +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/avro/HoodieAvroSerializerTrait.scala @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.avro + +/** + * Serializes Catalyst payload into Avro object + * + * NOTE: This is a low-level component operating on Spark internal data-types (comprising [[InternalRow]]). + * If you're looking to convert "deserialized" [[Row]] into Avro, please check [[AvroConversionUtils]] + */ +trait HoodieAvroSerializerTrait { + def serialize(catalystData: Any): Any +} diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/SparkAdapter.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/SparkAdapter.scala index 79c858e062519..32ed2b16ce639 100644 --- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/SparkAdapter.scala +++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/SparkAdapter.scala @@ -18,24 +18,42 @@ package org.apache.spark.sql.hudi +import org.apache.avro.Schema import org.apache.hudi.client.utils.SparkRowSerDe +import org.apache.spark.sql.avro.{HoodieAvroDeserializerTrait, HoodieAvroSerializerTrait} import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation +import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.catalyst.plans.JoinType -import org.apache.spark.sql.catalyst.plans.logical.{Join, LogicalPlan} +import org.apache.spark.sql.catalyst.plans.logical.{Join, LogicalPlan, SubqueryAlias} import org.apache.spark.sql.catalyst.{AliasIdentifier, TableIdentifier} -import org.apache.spark.sql.execution.datasources.SparkParsePartitionUtil +import org.apache.spark.sql.execution.datasources.{FilePartition, LogicalRelation, PartitionedFile, SparkParsePartitionUtil} import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.DataType import org.apache.spark.sql.{Row, SparkSession} +import java.util.Locale + /** * An interface to adapt the differences between spark2 and spark3 * in some spark-related classes. */ trait SparkAdapter extends Serializable { + /** + * Creates an instance of [[HoodieAvroSerializerTrait]], providing the ability to serialize + * Spark's [[InternalRow]] into Avro payloads + */ + def createAvroSerializer(rootCatalystType: DataType, rootAvroType: Schema, nullable: Boolean): HoodieAvroSerializerTrait + + /** + * Creates an instance of [[HoodieAvroDeserializerTrait]], providing the ability to deserialize + * Avro payloads into Spark's [[InternalRow]] + */ + def createAvroDeserializer(rootAvroType: Schema, rootCatalystType: DataType): HoodieAvroDeserializerTrait + /** * Create the SparkRowSerDe. */ @@ -92,4 +110,41 @@ trait SparkAdapter extends Serializable { * ParserInterface#parseMultipartIdentifier is supported since spark3; for spark2 this should not be called.
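Putting the two factory methods added to SparkAdapter together, a hedged round-trip sketch (structType, avroSchema and internalRow are assumed to be in scope and mutually consistent):

    import org.apache.avro.generic.GenericRecord
    import org.apache.spark.sql.catalyst.InternalRow

    val serializer = sparkAdapter.createAvroSerializer(structType, avroSchema, nullable = false)
    val deserializer = sparkAdapter.createAvroDeserializer(avroSchema, structType)

    val avroRecord = serializer.serialize(internalRow).asInstanceOf[GenericRecord]
    // deserialize always yields Option[Any]: HoodieAvroDeserializerTrait passes through
    // the Option returned by Spark 3.1 and wraps the bare value from other Spark versions
    val roundTripped = deserializer.deserialize(avroRecord).map(_.asInstanceOf[InternalRow])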
*/ def parseMultipartIdentifier(parser: ParserInterface, sqlText: String): Seq[String] + + /** + * Combine [[PartitionedFile]] to [[FilePartition]] according to `maxSplitBytes`. + */ + def getFilePartitions(sparkSession: SparkSession, partitionedFiles: Seq[PartitionedFile], + maxSplitBytes: Long): Seq[FilePartition] + + def isHoodieTable(table: LogicalPlan, spark: SparkSession): Boolean = { + tripAlias(table) match { + case LogicalRelation(_, _, Some(tbl), _) => isHoodieTable(tbl) + case relation: UnresolvedRelation => + isHoodieTable(toTableIdentifier(relation), spark) + case _=> false + } + } + + def isHoodieTable(map: java.util.Map[String, String]): Boolean = { + map.getOrDefault("provider", "").equals("hudi") + } + + def isHoodieTable(table: CatalogTable): Boolean = { + table.provider.map(_.toLowerCase(Locale.ROOT)).orNull == "hudi" + } + + def isHoodieTable(tableId: TableIdentifier, spark: SparkSession): Boolean = { + val table = spark.sessionState.catalog.getTableMetadata(tableId) + isHoodieTable(table) + } + + def tripAlias(plan: LogicalPlan): LogicalPlan = { + plan match { + case SubqueryAlias(_, relation: LogicalPlan) => + tripAlias(relation) + case other => + other + } + } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestClientRollback.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestClientRollback.java index 08960d97d8cb6..3b5393527fd79 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestClientRollback.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestClientRollback.java @@ -75,7 +75,7 @@ public void testSavepointAndRollback() throws Exception { HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig(HoodieCompactionConfig.newBuilder() .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(1).build()).build(); try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) { - HoodieTestDataGenerator.writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath); + HoodieTestDataGenerator.writePartitionMetadataDeprecated(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath); /** * Write 1 (only inserts) @@ -171,7 +171,7 @@ public void testSavepointAndRollback() throws Exception { } /** - * Test Cases for effects of rollbacking completed/inflight commits. + * Test Cases for effects of rolling back completed/inflight commits. 
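The isHoodieTable overloads now provided by SparkAdapter can be exercised directly; a small sketch (the table name and session are illustrative):

    import java.util.Collections
    import org.apache.spark.sql.catalyst.TableIdentifier

    val byProvider = sparkAdapter.isHoodieTable(
      Collections.singletonMap("provider", "hudi")) // true: map-based provider check
    val byIdentifier = sparkAdapter.isHoodieTable(
      TableIdentifier("my_table"), spark)           // resolves through the session catalog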
*/ @Test public void testRollbackCommit() throws Exception { diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieClientMultiWriter.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieClientMultiWriter.java index e629a76654780..3aeca0f275891 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieClientMultiWriter.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieClientMultiWriter.java @@ -18,7 +18,6 @@ package org.apache.hudi.client; -import org.apache.hadoop.fs.Path; import org.apache.hudi.client.transaction.lock.InProcessLockProvider; import org.apache.hudi.common.config.LockConfiguration; import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; @@ -38,7 +37,10 @@ import org.apache.hudi.config.HoodieLockConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieWriteConflictException; +import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.hudi.testutils.HoodieClientTestBase; + +import org.apache.hadoop.fs.Path; import org.apache.spark.api.java.JavaRDD; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Test; @@ -54,6 +56,7 @@ import java.util.Set; import java.util.concurrent.CountDownLatch; import java.util.concurrent.CyclicBarrier; +import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; @@ -64,6 +67,7 @@ import static org.apache.hudi.common.config.LockConfiguration.FILESYSTEM_LOCK_PATH_PROP_KEY; import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors; import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; @@ -94,10 +98,11 @@ public void testHoodieClientBasicMultiWriter(HoodieTableType tableType) throws E } Properties properties = new Properties(); properties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, basePath + "/.hoodie/.locks"); - properties.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY,"3000"); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000"); HoodieWriteConfig writeConfig = getConfigBuilder() .withCompactionConfig(HoodieCompactionConfig.newBuilder() - .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY).withAutoClean(false).build()) + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY) + .withAutoArchive(false).withAutoClean(false).build()) .withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL) // Timeline-server-based markers are not used for multi-writer tests .withMarkersType(MarkerType.DIRECT.name()) @@ -105,7 +110,7 @@ public void testHoodieClientBasicMultiWriter(HoodieTableType tableType) throws E .build()).withAutoCommit(false).withProperties(properties).build(); // Create the first commit - createCommitWithInserts(writeConfig, getHoodieWriteClient(writeConfig), "000", "001", 200); + createCommitWithInserts(writeConfig, getHoodieWriteClient(writeConfig), "000", "001", 200, true); final int threadCount = 2; final ExecutorService executors = Executors.newFixedThreadPool(2); @@ -182,9 +187,9 @@ public void 
testMultiWriterWithInsertsToDistinctPartitions(HoodieTableType table Properties properties = new Properties(); properties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, basePath + "/.hoodie/.locks"); - properties.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY,"3000"); - properties.setProperty(LockConfiguration.LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY,"3000"); - properties.setProperty(LockConfiguration.LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY,"20"); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000"); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY, "3000"); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY, "20"); HoodieWriteConfig cfg = getConfigBuilder() .withCompactionConfig(HoodieCompactionConfig.newBuilder() @@ -257,7 +262,7 @@ private void testMultiWriterWithAsyncTableServicesWithConflict(HoodieTableType t Properties properties = new Properties(); properties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, basePath + "/.hoodie/.locks"); properties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, basePath); - properties.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY,"3000"); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000"); // Disabling embedded timeline server, it doesn't work with multiwriter HoodieWriteConfig.Builder writeConfigBuilder = getConfigBuilder() .withCompactionConfig(HoodieCompactionConfig.newBuilder().withAutoClean(false) @@ -276,7 +281,7 @@ private void testMultiWriterWithAsyncTableServicesWithConflict(HoodieTableType t // Create the first commit with inserts HoodieWriteConfig cfg = writeConfigBuilder.build(); SparkRDDWriteClient client = getHoodieWriteClient(cfg); - createCommitWithInserts(cfg, client, "000", "001", 200); + createCommitWithInserts(cfg, client, "000", "001", 200, true); validInstants.add("001"); // Create 2 commits with upserts createCommitWithUpserts(cfg, client, "001", "000", "002", 100); @@ -351,7 +356,7 @@ private void testMultiWriterWithAsyncTableServicesWithConflict(HoodieTableType t final int numRecords = 100; latchCountDownAndWait(runCountDownLatch, 30000); assertDoesNotThrow(() -> { - createCommitWithInserts(cfg, client1, "003", newCommitTime, numRecords); + createCommitWithInserts(cfg, client1, "003", newCommitTime, numRecords, true); validInstants.add("007"); }); }); @@ -360,8 +365,8 @@ private void testMultiWriterWithAsyncTableServicesWithConflict(HoodieTableType t latchCountDownAndWait(runCountDownLatch, 30000); if (tableType == HoodieTableType.MERGE_ON_READ) { assertDoesNotThrow(() -> { - JavaRDD writeStatusJavaRDD = (JavaRDD) client2.compact("005"); - client2.commitCompaction("005", writeStatusJavaRDD, Option.empty()); + HoodieWriteMetadata> compactionMetadata = client2.compact("005"); + client2.commitCompaction("005", compactionMetadata.getCommitMetadata().get(), Option.empty()); validInstants.add("005"); }); } @@ -395,7 +400,7 @@ public void testHoodieClientMultiWriterWithClustering(HoodieTableType tableType) } Properties properties = new Properties(); properties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, basePath + "/.hoodie/.locks"); - properties.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY,"3000"); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000"); HoodieWriteConfig.Builder writeConfigBuilder = getConfigBuilder() 
.withCompactionConfig(HoodieCompactionConfig.newBuilder().withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY) .withAutoClean(false).build()) @@ -411,7 +416,7 @@ public void testHoodieClientMultiWriterWithClustering(HoodieTableType tableType) .build(); // Create the first commit - createCommitWithInserts(cfg, getHoodieWriteClient(cfg), "000", "001", 200); + createCommitWithInserts(cfg, getHoodieWriteClient(cfg), "000", "001", 200, true); // Start another inflight commit String newCommitTime = "003"; int numRecords = 100; @@ -441,6 +446,133 @@ public void testHoodieClientMultiWriterWithClustering(HoodieTableType tableType) } } + @Test + public void testHoodieClientMultiWriterAutoCommitForConflict() throws Exception { + Properties properties = new Properties(); + properties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, basePath + "/.hoodie/.locks"); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000"); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY, "100"); + HoodieWriteConfig.Builder writeConfigBuilder = getConfigBuilder() + .withCompactionConfig(HoodieCompactionConfig.newBuilder().withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY) + .withAutoClean(false).build()) + .withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL) + // Timeline-server-based markers are not used for multi-writer tests + .withMarkersType(MarkerType.DIRECT.name()) + .withLockConfig(HoodieLockConfig.newBuilder().withLockProvider(InProcessLockProvider.class) + .build()).withAutoCommit(true).withProperties(properties); + HoodieWriteConfig cfg = writeConfigBuilder.build(); + HoodieWriteConfig cfg2 = writeConfigBuilder.build(); + + // Create the first commit + createCommitWithInserts(cfg, getHoodieWriteClient(cfg), "000", "001", 5000, false); + // Start another inflight commit + String newCommitTime1 = "003"; + String newCommitTime2 = "004"; + SparkRDDWriteClient client1 = getHoodieWriteClient(cfg); + SparkRDDWriteClient client2 = getHoodieWriteClient(cfg2); + + List updates1 = dataGen.generateUpdates(newCommitTime1, 5000); + List updates2 = dataGen.generateUpdates(newCommitTime2, 5000); + + JavaRDD writeRecords1 = jsc.parallelize(updates1, 4); + JavaRDD writeRecords2 = jsc.parallelize(updates2, 4); + + runConcurrentAndAssert(writeRecords1, writeRecords2, client1, client2, SparkRDDWriteClient::upsert, true); + } + + private void runConcurrentAndAssert(JavaRDD writeRecords1, JavaRDD writeRecords2, + SparkRDDWriteClient client1, SparkRDDWriteClient client2, + Function3, SparkRDDWriteClient, JavaRDD, String> writeFn, + boolean assertForConflict) throws ExecutionException, InterruptedException { + + CountDownLatch runCountDownLatch = new CountDownLatch(2); + final ExecutorService executors = Executors.newFixedThreadPool(2); + String newCommitTime1 = "003"; + String newCommitTime2 = "004"; + + AtomicBoolean client1Succeeded = new AtomicBoolean(true); + AtomicBoolean client2Succeeded = new AtomicBoolean(true); + + Future future1 = executors.submit(() -> { + try { + ingestBatch(writeFn, client1, newCommitTime1, writeRecords1, runCountDownLatch); + } catch (IOException e) { + LOG.error("IOException thrown " + e.getMessage()); + } catch (InterruptedException e) { + LOG.error("Interrupted Exception thrown " + e.getMessage()); + } catch (Exception e) { + client1Succeeded.set(false); + } + } + ); + + Future future2 = executors.submit(() -> { + try { + ingestBatch(writeFn, client2, newCommitTime2, 
writeRecords2, runCountDownLatch); + } catch (IOException e) { + LOG.error("IOException thrown " + e.getMessage()); + } catch (InterruptedException e) { + LOG.error("Interrupted Exception thrown " + e.getMessage()); + } catch (Exception e) { + client2Succeeded.set(false); + } + } + ); + + future1.get(); + future2.get(); + if (assertForConflict) { + assertFalse(client1Succeeded.get() && client2Succeeded.get()); + assertTrue(client1Succeeded.get() || client2Succeeded.get()); + } else { + assertTrue(client2Succeeded.get() && client1Succeeded.get()); + } + } + + @Test + public void testHoodieClientMultiWriterAutoCommitNonConflict() throws Exception { + Properties properties = new Properties(); + properties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, basePath + "/.hoodie/.locks"); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000"); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY, "100"); + HoodieWriteConfig.Builder writeConfigBuilder = getConfigBuilder() + .withCompactionConfig(HoodieCompactionConfig.newBuilder().withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY) + .withAutoClean(false).build()) + .withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL) + // Timeline-server-based markers are not used for multi-writer tests + .withMarkersType(MarkerType.DIRECT.name()) + .withLockConfig(HoodieLockConfig.newBuilder().withLockProvider(InProcessLockProvider.class) + .build()).withAutoCommit(true).withProperties(properties); + HoodieWriteConfig cfg = writeConfigBuilder.build(); + HoodieWriteConfig cfg2 = writeConfigBuilder.build(); + + // Create the first commit + createCommitWithInserts(cfg, getHoodieWriteClient(cfg), "000", "001", 200, false); + // Start another inflight commit + String newCommitTime1 = "003"; + String newCommitTime2 = "004"; + SparkRDDWriteClient client1 = getHoodieWriteClient(cfg); + SparkRDDWriteClient client2 = getHoodieWriteClient(cfg2); + + List updates1 = dataGen.generateInserts(newCommitTime1, 200); + List updates2 = dataGen.generateInserts(newCommitTime2, 200); + + JavaRDD writeRecords1 = jsc.parallelize(updates1, 1); + JavaRDD writeRecords2 = jsc.parallelize(updates2, 1); + + runConcurrentAndAssert(writeRecords1, writeRecords2, client1, client2, SparkRDDWriteClient::bulkInsert, false); + } + + private void ingestBatch(Function3, SparkRDDWriteClient, JavaRDD, String> writeFn, + SparkRDDWriteClient writeClient, String commitTime, JavaRDD records, + CountDownLatch countDownLatch) throws IOException, InterruptedException { + writeClient.startCommitWithTime(commitTime); + countDownLatch.countDown(); + countDownLatch.await(); + JavaRDD statusJavaRDD = writeFn.apply(writeClient, records, commitTime); + statusJavaRDD.collect(); + } + private void createCommitWithInsertsForPartition(HoodieWriteConfig cfg, SparkRDDWriteClient client, String prevCommitTime, String newCommitTime, int numRecords, String partition) throws Exception { @@ -450,11 +582,14 @@ private void createCommitWithInsertsForPartition(HoodieWriteConfig cfg, SparkRDD } private void createCommitWithInserts(HoodieWriteConfig cfg, SparkRDDWriteClient client, - String prevCommitTime, String newCommitTime, int numRecords) throws Exception { - // Finish first base commmit + String prevCommitTime, String newCommitTime, int numRecords, + boolean doCommit) throws Exception { + // Finish first base commit JavaRDD result = insertFirstBatch(cfg, client, newCommitTime, prevCommitTime, numRecords, 
SparkRDDWriteClient::bulkInsert, false, false, numRecords); - assertTrue(client.commit(newCommitTime, result), "Commit should succeed"); + if (doCommit) { + assertTrue(client.commit(newCommitTime, result), "Commit should succeed"); + } } private void createCommitWithUpserts(HoodieWriteConfig cfg, SparkRDDWriteClient client, String prevCommit, diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieReadClient.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieReadClient.java index 1cd7d6ee9947d..872a4a4215ffc 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieReadClient.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieReadClient.java @@ -18,6 +18,7 @@ package org.apache.hudi.client; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.util.Option; @@ -209,7 +210,7 @@ private void testTagLocation(HoodieWriteConfig hoodieWriteConfig, // since they have been modified in the DAG JavaRDD recordRDD = jsc.parallelize(result.collect().stream().map(WriteStatus::getWrittenRecords).flatMap(Collection::stream) - .map(record -> new HoodieRecord(record.getKey(), null)).collect(Collectors.toList())); + .map(record -> new HoodieAvroRecord(record.getKey(), null)).collect(Collectors.toList())); // Should have 100 records in table (check using Index), all in locations marked at commit HoodieReadClient readClient = getHoodieReadClient(hoodieWriteConfig.getBasePath()); List taggedRecords = readClient.tagLocation(recordRDD).collect(); @@ -225,7 +226,7 @@ private void testTagLocation(HoodieWriteConfig hoodieWriteConfig, numRecords, 200, 2); recordRDD = jsc.parallelize(result.collect().stream().map(WriteStatus::getWrittenRecords).flatMap(Collection::stream) - .map(record -> new HoodieRecord(record.getKey(), null)).collect(Collectors.toList())); + .map(record -> new HoodieAvroRecord(record.getKey(), null)).collect(Collectors.toList())); // Index should be able to locate all updates in correct locations. 
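As the TestHoodieReadClient and TestTableSchemaEvolution changes above show, tests now construct the concrete Avro-backed record class rather than HoodieRecord itself; a minimal sketch with an illustrative key:

    import org.apache.hudi.common.model.{HoodieAvroRecord, HoodieKey}

    // Key-only record (null payload), as used for the tagLocation lookups above
    val key = new HoodieKey("record-uuid-1", "2022/01/26")
    val record = new HoodieAvroRecord(key, null)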
readClient = getHoodieReadClient(hoodieWriteConfig.getBasePath()); taggedRecords = readClient.tagLocation(recordRDD).collect(); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestMultiFS.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestMultiFS.java index 457b8b526aa04..df0fed027cec1 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestMultiFS.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestMultiFS.java @@ -18,7 +18,6 @@ package org.apache.hudi.client; -import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieAvroPayload; import org.apache.hudi.common.model.HoodieRecord; @@ -69,14 +68,9 @@ public void tearDown() throws Exception { } protected HoodieWriteConfig getHoodieWriteConfig(String basePath) { - return getHoodieWriteConfig(basePath, HoodieMetadataConfig.ENABLE.defaultValue()); - } - - protected HoodieWriteConfig getHoodieWriteConfig(String basePath, boolean enableMetadata) { return HoodieWriteConfig.newBuilder().withPath(basePath).withEmbeddedTimelineServerEnabled(true) .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable(tableName) .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(enableMetadata).build()) .build(); } @@ -84,21 +78,21 @@ protected HoodieWriteConfig getHoodieWriteConfig(String basePath, boolean enable public void readLocalWriteHDFS() throws Exception { // Initialize table and filesystem HoodieTableMetaClient.withPropertyBuilder() - .setTableType(tableType) - .setTableName(tableName) - .setPayloadClass(HoodieAvroPayload.class) - .initTable(hadoopConf, dfsBasePath); + .setTableType(tableType) + .setTableName(tableName) + .setPayloadClass(HoodieAvroPayload.class) + .initTable(hadoopConf, dfsBasePath); // Create write client to write some records in - HoodieWriteConfig cfg = getHoodieWriteConfig(dfsBasePath, false); - HoodieWriteConfig localConfig = getHoodieWriteConfig(tablePath, false); + HoodieWriteConfig cfg = getHoodieWriteConfig(dfsBasePath); + HoodieWriteConfig localConfig = getHoodieWriteConfig(tablePath); HoodieTableMetaClient.withPropertyBuilder() .setTableType(tableType) .setTableName(tableName) .setPayloadClass(HoodieAvroPayload.class) .setRecordKeyFields(localConfig.getProps().getProperty(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key())) - .setPartitionFields(localConfig.getProps().getProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key())) + .setPartitionFields(localConfig.getProps().getProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key())) .initTable(hadoopConf, tablePath); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestTableSchemaEvolution.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestTableSchemaEvolution.java index dda396a135676..3fb454940bf5d 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestTableSchemaEvolution.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestTableSchemaEvolution.java @@ -19,6 +19,7 @@ package org.apache.hudi.client; import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import 
org.apache.hudi.common.model.HoodieTableType; @@ -146,7 +147,7 @@ public void testSchemaCompatibilityBasic() throws Exception { + TIP_NESTED_SCHEMA + EXTRA_FIELD_SCHEMA + EXTRA_FIELD_SCHEMA.replace("new_field", "new_new_field") + TRIP_SCHEMA_SUFFIX; assertTrue(TableSchemaResolver.isSchemaCompatible(TRIP_EXAMPLE_SCHEMA, multipleAddedFieldSchema), - "Multiple added fields with defauls are compatible"); + "Multiple added fields with defaults are compatible"); assertFalse(TableSchemaResolver.isSchemaCompatible(TRIP_EXAMPLE_SCHEMA, TRIP_SCHEMA_PREFIX + EXTRA_TYPE_SCHEMA + MAP_TYPE_SCHEMA @@ -204,7 +205,7 @@ public void testMORTable() throws Exception { final List failedRecords = generateInsertsWithSchema("004", numRecords, TRIP_EXAMPLE_SCHEMA_DEVOLVED); try { // We cannot use insertBatch directly here because we want to insert records - // with a devolved schema and insertBatch inserts records using the TRIP_EXMPLE_SCHEMA. + // with a devolved schema and insertBatch inserts records using the TRIP_EXAMPLE_SCHEMA. writeBatch(client, "005", "004", Option.empty(), "003", numRecords, (String s, Integer a) -> failedRecords, SparkRDDWriteClient::insert, false, 0, 0, 0, false); fail("Insert with devolved scheme should fail"); @@ -232,7 +233,7 @@ public void testMORTable() throws Exception { client = getHoodieWriteClient(hoodieEvolvedWriteConfig); // We cannot use insertBatch directly here because we want to insert records - // with a evolved schemaand insertBatch inserts records using the TRIP_EXMPLE_SCHEMA. + // with an evolved schema and insertBatch inserts records using the TRIP_EXAMPLE_SCHEMA. final List evolvedRecords = generateInsertsWithSchema("005", numRecords, TRIP_EXAMPLE_SCHEMA_EVOLVED); writeBatch(client, "005", "004", Option.empty(), initCommitTime, numRecords, (String s, Integer a) -> evolvedRecords, SparkRDDWriteClient::insert, false, 0, 0, 0, false); @@ -497,9 +498,9 @@ private List convertToSchema(List records, String sc HoodieKey key = r.getKey(); GenericRecord payload; try { - payload = (GenericRecord)r.getData().getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA).get(); + payload = (GenericRecord) ((HoodieAvroRecord) r).getData().getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA).get(); GenericRecord newPayload = HoodieAvroUtils.rewriteRecord(payload, newSchema); - return new HoodieRecord(key, new RawTripTestPayload(newPayload.toString(), key.getRecordKey(), key.getPartitionPath(), schemaStr)); + return new HoodieAvroRecord(key, new RawTripTestPayload(newPayload.toString(), key.getRecordKey(), key.getPartitionPath(), schemaStr)); } catch (IOException e) { throw new RuntimeException("Conversion to new schema failed"); } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestUpdateSchemaEvolution.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestUpdateSchemaEvolution.java index 00e65a67c08e7..70f5e9f3bfd1d 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestUpdateSchemaEvolution.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestUpdateSchemaEvolution.java @@ -19,6 +19,7 @@ package org.apache.hudi.client; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; @@ -82,7 +83,7 @@ private WriteStatus prepareFirstRecordCommit(List recordsStrs) throws IO for (String recordStr : 
recordsStrs) { RawTripTestPayload rowChange = new RawTripTestPayload(recordStr); insertRecords - .add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange)); + .add(new HoodieAvroRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange)); } Map insertRecordMap = insertRecords.stream() .collect(Collectors.toMap(r -> r.getRecordKey(), Function.identity())); @@ -147,7 +148,7 @@ private List buildUpdateRecords(String recordStr, String insertFil List updateRecords = new ArrayList<>(); RawTripTestPayload rowChange = new RawTripTestPayload(recordStr); HoodieRecord record = - new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange); + new HoodieAvroRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange); record.setCurrentLocation(new HoodieRecordLocation("101", insertFileId)); record.seal(); updateRecords.add(record); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java index e3db3914ada77..223625fe7e469 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedMetadata.java @@ -18,6 +18,7 @@ package org.apache.hudi.client.functional; +import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.model.HoodieMetadataRecord; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; @@ -31,20 +32,24 @@ import org.apache.hudi.common.metrics.Registry; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieCleaningPolicy; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieFileGroup; import org.apache.hudi.common.model.HoodieFileGroupId; import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.WriteConcurrencyMode; import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.HoodieTableVersion; -import org.apache.hudi.common.table.marker.MarkerType; +import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.common.table.log.HoodieLogFormat; +import org.apache.hudi.common.table.log.block.HoodieDataBlock; +import org.apache.hudi.common.table.log.block.HoodieLogBlock; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; @@ -57,26 +62,25 @@ import org.apache.hudi.common.testutils.HoodieMetadataTestTable; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.common.util.HoodieTimer; import 
org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ExternalSpillableMap; import org.apache.hudi.config.HoodieClusteringConfig; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieLockConfig; import org.apache.hudi.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.config.metrics.HoodieMetricsConfig; -import org.apache.hudi.config.metrics.HoodieMetricsGraphiteConfig; -import org.apache.hudi.config.metrics.HoodieMetricsJmxConfig; import org.apache.hudi.exception.HoodieMetadataException; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.io.storage.HoodieHFileReader; import org.apache.hudi.metadata.FileSystemBackedTableMetadata; import org.apache.hudi.metadata.HoodieBackedTableMetadataWriter; +import org.apache.hudi.metadata.HoodieMetadataMergedLogRecordReader; import org.apache.hudi.metadata.HoodieMetadataMetrics; import org.apache.hudi.metadata.HoodieMetadataPayload; import org.apache.hudi.metadata.HoodieTableMetadata; -import org.apache.hudi.metadata.HoodieTableMetadataKeyGenerator; import org.apache.hudi.metadata.MetadataPartitionType; import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; import org.apache.hudi.table.HoodieSparkTable; @@ -86,6 +90,7 @@ import org.apache.hudi.table.upgrade.UpgradeDowngrade; import org.apache.hudi.testutils.MetadataMergeWriteStatus; +import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.fs.FSDataOutputStream; @@ -96,6 +101,8 @@ import org.apache.hadoop.util.Time; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.MessageType; import org.apache.spark.api.java.JavaRDD; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Tag; @@ -135,8 +142,8 @@ import static org.apache.hudi.common.model.WriteOperationType.INSERT; import static org.apache.hudi.common.model.WriteOperationType.UPSERT; import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; -import static org.apache.hudi.metadata.HoodieTableMetadata.METADATA_TABLE_NAME_SUFFIX; import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; @@ -185,6 +192,8 @@ public void testMetadataTableBootstrap(HoodieTableType tableType, boolean addRol // trigger couple of upserts doWriteOperation(testTable, "0000005"); doWriteOperation(testTable, "0000006"); + doWriteOperation(testTable, "0000007"); + doCleanAndValidate(testTable, "0000008", Arrays.asList("0000007")); validateMetadata(testTable, true); } @@ -216,7 +225,7 @@ public void testOnlyValidPartitionsAdded(HoodieTableType tableType) throws Excep testTable.doWriteOperation("0000003", UPSERT, emptyList(), asList("p1", "p2"), 1, true); syncTableMetadata(writeConfig); - List partitions = metadataWriter(writeConfig).metadata().getAllPartitionPaths(); + List partitions = metadataWriter(writeConfig).getTableMetadata().getAllPartitionPaths(); assertFalse(partitions.contains(nonPartitionDirectory), "Must not contain the non-partition " + nonPartitionDirectory); 
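    // The expectation being pinned down here: only directories that Hudi itself
    // registered as partitions (via commits and their partition metafiles) should
    // come back from getAllPartitionPaths(); a stray directory created directly
    // on storage, like the one above, must never surface.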
assertTrue(partitions.contains("p1"), "Must contain partition p1"); @@ -339,6 +348,7 @@ public void testInsertUpsertCluster(HoodieTableType tableType) throws Exception if (tableType == MERGE_ON_READ) { doCompaction(testTable, "0000004"); } + doCleanAndValidate(testTable, "0000005", Arrays.asList("0000001")); validateMetadata(testTable, emptyList(), true); } @@ -374,6 +384,31 @@ public void testMetadataTableServices() throws Exception { assertEquals(tableMetadata.getLatestCompactionTime().get(), "0000003001"); } + @ParameterizedTest + @EnumSource(HoodieTableType.class) + public void testTableOperationsWithMetadataIndex(HoodieTableType tableType) throws Exception { + initPath(); + HoodieWriteConfig writeConfig = getWriteConfigBuilder(true, true, false) + .withIndexConfig(HoodieIndexConfig.newBuilder() + .bloomIndexBucketizedChecking(false) + .build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .enable(true) + .withMetadataIndexBloomFilter(true) + .withMetadataIndexBloomFilterFileGroups(4) + .withMetadataIndexColumnStats(true) + .withMetadataIndexBloomFilterFileGroups(2) + .withMetadataIndexForAllColumns(true) + .build()) + .build(); + init(tableType, writeConfig); + testTableOperationsForMetaIndexImpl(writeConfig); + } + + private void testTableOperationsForMetaIndexImpl(final HoodieWriteConfig writeConfig) throws Exception { + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + testTableOperationsImpl(engineContext, writeConfig); + } /** * Tests that virtual key configs are honored in base files after compaction in metadata table. @@ -508,6 +543,257 @@ public void testMetadataTableWithPendingCompaction(boolean simulateFailedCompact } } + /** + * Test arguments - Table type, populate meta fields, exclude key from payload. + */ + public static List testMetadataRecordKeyExcludeFromPayloadArgs() { + return asList( + Arguments.of(COPY_ON_WRITE, true), + Arguments.of(COPY_ON_WRITE, false), + Arguments.of(MERGE_ON_READ, true), + Arguments.of(MERGE_ON_READ, false) + ); + } + + /** + * 1. Verify metadata table records key deduplication feature. When record key + * deduplication is enabled, verify the metadata record payload on disk has empty key. + * Otherwise, verify the valid key. + * 2. Verify populate meta fields work irrespective of record key deduplication config. + * 3. Verify table services like compaction benefit from record key deduplication feature. + */ + @ParameterizedTest + @MethodSource("testMetadataRecordKeyExcludeFromPayloadArgs") + public void testMetadataRecordKeyExcludeFromPayload(final HoodieTableType tableType, final boolean enableMetaFields) throws Exception { + initPath(); + writeConfig = getWriteConfigBuilder(true, true, false) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .enable(true) + .withPopulateMetaFields(enableMetaFields) + .withMaxNumDeltaCommitsBeforeCompaction(3) + .build()) + .build(); + init(tableType, writeConfig); + + // 2nd commit + doWriteOperation(testTable, "0000001", INSERT); + + final HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder() + .setConf(hadoopConf) + .setBasePath(metadataTableBasePath) + .build(); + HoodieWriteConfig metadataTableWriteConfig = getMetadataWriteConfig(writeConfig); + metadataMetaClient.reloadActiveTimeline(); + final HoodieTable table = HoodieSparkTable.create(metadataTableWriteConfig, context, metadataMetaClient); + + // Compaction has not yet kicked in. Verify all the log files + // for the metadata records persisted on disk as per the config. 
+    assertDoesNotThrow(() -> {
+      verifyMetadataRecordKeyExcludeFromPayloadLogFiles(table, metadataMetaClient, "0000001",
+          enableMetaFields);
+    }, "Metadata table should have valid log files!");
+
+    // Verify no base file created yet.
+    assertThrows(IllegalStateException.class, () -> {
+      verifyMetadataRecordKeyExcludeFromPayloadBaseFiles(table, enableMetaFields);
+    }, "Metadata table should not have a base file yet!");
+
+    // 2 more commits
+    doWriteOperation(testTable, "0000002", UPSERT);
+    doWriteOperation(testTable, "0000004", UPSERT);
+
+    // Compaction should be triggered by now. Let's verify the log files,
+    // if any, for the metadata records persisted on disk as per the config.
+    assertDoesNotThrow(() -> {
+      verifyMetadataRecordKeyExcludeFromPayloadLogFiles(table, metadataMetaClient, "0000002",
+          enableMetaFields);
+    }, "Metadata table should have valid log files!");
+
+    // Verify the base file created by the just completed compaction.
+    assertDoesNotThrow(() -> {
+      verifyMetadataRecordKeyExcludeFromPayloadBaseFiles(table, enableMetaFields);
+    }, "Metadata table should have a valid base file!");
+
+    // 2 more commits to trigger one more compaction, along with a clean
+    doWriteOperation(testTable, "0000005", UPSERT);
+    doClean(testTable, "0000006", Arrays.asList("0000004"));
+    doWriteOperation(testTable, "0000007", UPSERT);
+
+    assertDoesNotThrow(() -> {
+      verifyMetadataRecordKeyExcludeFromPayloadLogFiles(table, metadataMetaClient, "7", enableMetaFields);
+    }, "Metadata table should have valid log files!");
+
+    assertDoesNotThrow(() -> {
+      verifyMetadataRecordKeyExcludeFromPayloadBaseFiles(table, enableMetaFields);
+    }, "Metadata table should have a valid base file!");
+
+    validateMetadata(testTable);
+  }
+
+  /**
+   * Verify the metadata table log files for the record field correctness. The on-disk
+   * format should follow the meta fields and key deduplication configs, and the in-memory
+   * merged records should all be fully materialized irrespective of the config.
+   *
+   * @param table - Hoodie metadata test table
+   * @param metadataMetaClient - Metadata meta client
+   * @param latestCommitTimestamp - Latest commit timestamp
+   * @param enableMetaFields - Enable meta fields for the table records
+   * @throws IOException
+   */
+  private void verifyMetadataRecordKeyExcludeFromPayloadLogFiles(HoodieTable table, HoodieTableMetaClient metadataMetaClient,
+                                                                 String latestCommitTimestamp,
+                                                                 boolean enableMetaFields) throws IOException {
+    table.getHoodieView().sync();
+
+    // A file slice with log files must be available for verification.
+    List<FileSlice> fileSlices = table.getSliceView()
+        .getLatestFileSlices(MetadataPartitionType.FILES.getPartitionPath()).collect(Collectors.toList());
+    if (fileSlices.isEmpty()) {
+      throw new IllegalStateException("LogFile slices are not available!");
+    }
+
+    // Verify the log files honor the key deduplication and virtual keys config
+    List<HoodieLogFile> logFiles = fileSlices.get(0).getLogFiles().collect(Collectors.toList());
+    List<String> logFilePaths = logFiles.stream().map(logFile -> logFile.getPath().toString())
+        .collect(Collectors.toList());
+
+    // Verify the on-disk raw records before they get materialized
+    verifyMetadataRawRecords(table, logFiles, enableMetaFields);
+
+    // Verify the in-memory materialized and merged records
+    verifyMetadataMergedRecords(metadataMetaClient, logFilePaths, latestCommitTimestamp, enableMetaFields);
+  }
+
+  /**
+   * Verify the metadata table on-disk raw records.
When populate meta fields is enabled, + * these records should have additional meta fields in the payload. When key deduplication + * is enabled, these records on the disk should have key in the payload as empty string. + * + * @param table + * @param logFiles - Metadata table log files to be verified + * @param enableMetaFields - Enable meta fields for records + * @throws IOException + */ + private void verifyMetadataRawRecords(HoodieTable table, List logFiles, boolean enableMetaFields) throws IOException { + for (HoodieLogFile logFile : logFiles) { + FileStatus[] fsStatus = fs.listStatus(logFile.getPath()); + MessageType writerSchemaMsg = TableSchemaResolver.readSchemaFromLogFile(fs, logFile.getPath()); + if (writerSchemaMsg == null) { + // not a data block + continue; + } + + Schema writerSchema = new AvroSchemaConverter().convert(writerSchemaMsg); + HoodieLogFormat.Reader logFileReader = HoodieLogFormat.newReader(fs, new HoodieLogFile(fsStatus[0].getPath()), writerSchema); + + while (logFileReader.hasNext()) { + HoodieLogBlock logBlock = logFileReader.next(); + if (logBlock instanceof HoodieDataBlock) { + try (ClosableIterator recordItr = ((HoodieDataBlock) logBlock).getRecordItr()) { + recordItr.forEachRemaining(indexRecord -> { + final GenericRecord record = (GenericRecord) indexRecord; + if (enableMetaFields) { + // Metadata table records should have meta fields! + assertNotNull(record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD)); + assertNotNull(record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD)); + } else { + // Metadata table records should not have meta fields! + assertNull(record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD)); + assertNull(record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD)); + } + + final String key = String.valueOf(record.get(HoodieMetadataPayload.KEY_FIELD_NAME)); + assertFalse(key.isEmpty()); + if (enableMetaFields) { + assertTrue(key.equals(String.valueOf(record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD)))); + } + }); + } + } + } + } + } + + /** + * Verify the metadata table in-memory merged records. Irrespective of key deduplication + * config, the in-memory merged records should always have the key field in the record + * payload fully materialized. 
+ * + * @param metadataMetaClient - Metadata table meta client + * @param logFilePaths - Metadata table log file paths + * @param latestCommitTimestamp + * @param enableMetaFields - Enable meta fields + */ + private void verifyMetadataMergedRecords(HoodieTableMetaClient metadataMetaClient, List logFilePaths, + String latestCommitTimestamp, boolean enableMetaFields) { + Schema schema = HoodieAvroUtils.addMetadataFields(HoodieMetadataRecord.getClassSchema()); + if (enableMetaFields) { + schema = HoodieAvroUtils.addMetadataFields(schema); + } + HoodieMetadataMergedLogRecordReader logRecordReader = HoodieMetadataMergedLogRecordReader.newBuilder() + .withFileSystem(metadataMetaClient.getFs()) + .withBasePath(metadataMetaClient.getBasePath()) + .withLogFilePaths(logFilePaths) + .withLatestInstantTime(latestCommitTimestamp) + .withPartition(MetadataPartitionType.FILES.getPartitionPath()) + .withReaderSchema(schema) + .withMaxMemorySizeInBytes(100000L) + .withBufferSize(4096) + .withSpillableMapBasePath(tempDir.toString()) + .withDiskMapType(ExternalSpillableMap.DiskMapType.BITCASK) + .build(); + + assertDoesNotThrow(() -> { + logRecordReader.scan(); + }, "Metadata log records materialization failed"); + + for (Map.Entry> entry : logRecordReader.getRecords().entrySet()) { + assertFalse(entry.getKey().isEmpty()); + assertFalse(entry.getValue().getRecordKey().isEmpty()); + assertEquals(entry.getKey(), entry.getValue().getRecordKey()); + } + } + + /** + * Verify metadata table base files for the records persisted based on the config. When + * the key deduplication is enabled, the records persisted on the disk in the base file + * should have key field in the payload as empty string. + * + * @param table - Metadata table + * @param enableMetaFields - Enable meta fields + */ + private void verifyMetadataRecordKeyExcludeFromPayloadBaseFiles(HoodieTable table, boolean enableMetaFields) throws IOException { + table.getHoodieView().sync(); + List fileSlices = table.getSliceView() + .getLatestFileSlices(MetadataPartitionType.FILES.getPartitionPath()).collect(Collectors.toList()); + if (!fileSlices.get(0).getBaseFile().isPresent()) { + throw new IllegalStateException("Base file not available!"); + } + final HoodieBaseFile baseFile = fileSlices.get(0).getBaseFile().get(); + + HoodieHFileReader hoodieHFileReader = new HoodieHFileReader(context.getHadoopConf().get(), + new Path(baseFile.getPath()), + new CacheConfig(context.getHadoopConf().get())); + List> records = hoodieHFileReader.readAllRecords(); + records.forEach(entry -> { + if (enableMetaFields) { + assertNotNull(((GenericRecord) entry.getSecond()).get(HoodieRecord.RECORD_KEY_METADATA_FIELD)); + } else { + assertNull(((GenericRecord) entry.getSecond()).get(HoodieRecord.RECORD_KEY_METADATA_FIELD)); + } + + final String keyInPayload = (String) ((GenericRecord) entry.getSecond()) + .get(HoodieMetadataPayload.KEY_FIELD_NAME); + assertFalse(keyInPayload.isEmpty()); + }); + } + /** * Test rollback of various table operations sync to Metadata Table correctly. 
*/ @@ -804,10 +1090,20 @@ public void testMetadataPayloadSpuriousDeletes(boolean ignoreSpuriousDeletes) th public void testTableOperationsWithRestore(HoodieTableType tableType) throws Exception { init(tableType); HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + HoodieWriteConfig writeConfig = getWriteConfigBuilder(true, true, false) + .withRollbackUsingMarkers(false).build(); + testTableOperationsImpl(engineContext, writeConfig); + } - try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, - getWriteConfigBuilder(true, true, false).withRollbackUsingMarkers(false).build())) { - + /** + * Test all major table operations with the given table, config and context. + * + * @param engineContext - Engine context + * @param writeConfig - Write config + * @throws IOException + */ + private void testTableOperationsImpl(HoodieSparkEngineContext engineContext, HoodieWriteConfig writeConfig) throws IOException { + try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, writeConfig)) { // Write 1 (Bulk insert) String newCommitTime = "0000001"; List records = dataGen.generateInserts(newCommitTime, 20); @@ -892,7 +1188,8 @@ public void testMetadataMultiWriter() throws Exception { Properties properties = new Properties(); properties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, basePath + "/.hoodie/.locks"); - properties.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY,"3000"); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY,"1000"); + properties.setProperty(LockConfiguration.LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY,"20"); HoodieWriteConfig writeConfig = getWriteConfigBuilder(true, true, false) .withCompactionConfig(HoodieCompactionConfig.newBuilder() .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY).withAutoClean(false).build()) @@ -1254,7 +1551,7 @@ public void testUpgradeDowngrade() throws IOException { assertTrue(currentStatus.getModificationTime() > prevStatus.getModificationTime()); initMetaClient(); - assertEquals(metaClient.getTableConfig().getTableVersion().versionCode(), HoodieTableVersion.THREE.versionCode()); + assertEquals(metaClient.getTableConfig().getTableVersion().versionCode(), HoodieTableVersion.FOUR.versionCode()); assertTrue(fs.exists(new Path(metadataTableBasePath)), "Metadata table should exist"); FileStatus newStatus = fs.getFileStatus(new Path(metadataTableBasePath)); assertTrue(oldStatus.getModificationTime() < newStatus.getModificationTime()); @@ -1336,7 +1633,7 @@ public void testRollbackDuringUpgradeForDoubleLocking() throws IOException, Inte } initMetaClient(); - assertEquals(metaClient.getTableConfig().getTableVersion().versionCode(), HoodieTableVersion.THREE.versionCode()); + assertEquals(metaClient.getTableConfig().getTableVersion().versionCode(), HoodieTableVersion.FOUR.versionCode()); assertTrue(fs.exists(new Path(metadataTableBasePath)), "Metadata table should exist"); FileStatus newStatus = fs.getFileStatus(new Path(metadataTableBasePath)); assertTrue(oldStatus.getModificationTime() < newStatus.getModificationTime()); @@ -1483,7 +1780,7 @@ public void testMetadataMetrics() throws Exception { assertTrue(metricsRegistry.getAllCounts().containsKey(HoodieMetadataMetrics.INITIALIZE_STR + ".count")); assertTrue(metricsRegistry.getAllCounts().containsKey(HoodieMetadataMetrics.INITIALIZE_STR + ".totalDuration")); assertTrue(metricsRegistry.getAllCounts().get(HoodieMetadataMetrics.INITIALIZE_STR + ".count") >= 1L); - final String prefix = 
MetadataPartitionType.FILES.partitionPath() + "."; + final String prefix = MetadataPartitionType.FILES.getPartitionPath() + "."; assertTrue(metricsRegistry.getAllCounts().containsKey(prefix + HoodieMetadataMetrics.STAT_COUNT_BASE_FILES)); assertTrue(metricsRegistry.getAllCounts().containsKey(prefix + HoodieMetadataMetrics.STAT_COUNT_LOG_FILES)); assertTrue(metricsRegistry.getAllCounts().containsKey(prefix + HoodieMetadataMetrics.STAT_TOTAL_BASE_FILE_SIZE)); @@ -1491,95 +1788,6 @@ public void testMetadataMetrics() throws Exception { } } - /** - * Fetching WriteConfig for metadata table from Data table's writeConfig is not trivial and the method is not public in source code. so, for now, - * using this method which mimics source code. - * @param writeConfig - * @return - */ - private HoodieWriteConfig getMetadataWriteConfig(HoodieWriteConfig writeConfig) { - int parallelism = writeConfig.getMetadataInsertParallelism(); - - int minCommitsToKeep = Math.max(writeConfig.getMetadataMinCommitsToKeep(), writeConfig.getMinCommitsToKeep()); - int maxCommitsToKeep = Math.max(writeConfig.getMetadataMaxCommitsToKeep(), writeConfig.getMaxCommitsToKeep()); - - // Create the write config for the metadata table by borrowing options from the main write config. - HoodieWriteConfig.Builder builder = HoodieWriteConfig.newBuilder() - .withTimelineLayoutVersion(TimelineLayoutVersion.CURR_VERSION) - .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder() - .withConsistencyCheckEnabled(writeConfig.getConsistencyGuardConfig().isConsistencyCheckEnabled()) - .withInitialConsistencyCheckIntervalMs(writeConfig.getConsistencyGuardConfig().getInitialConsistencyCheckIntervalMs()) - .withMaxConsistencyCheckIntervalMs(writeConfig.getConsistencyGuardConfig().getMaxConsistencyCheckIntervalMs()) - .withMaxConsistencyChecks(writeConfig.getConsistencyGuardConfig().getMaxConsistencyChecks()) - .build()) - .withWriteConcurrencyMode(WriteConcurrencyMode.SINGLE_WRITER) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).withFileListingParallelism(writeConfig.getFileListingParallelism()).build()) - .withAutoCommit(true) - .withAvroSchemaValidate(true) - .withEmbeddedTimelineServerEnabled(false) - .withMarkersType(MarkerType.DIRECT.name()) - .withRollbackUsingMarkers(false) - .withPath(HoodieTableMetadata.getMetadataTableBasePath(writeConfig.getBasePath())) - .withSchema(HoodieMetadataRecord.getClassSchema().toString()) - .forTable(writeConfig.getTableName() + METADATA_TABLE_NAME_SUFFIX) - .withCompactionConfig(HoodieCompactionConfig.newBuilder() - .withAsyncClean(writeConfig.isMetadataAsyncClean()) - // we will trigger cleaning manually, to control the instant times - .withAutoClean(false) - .withCleanerParallelism(parallelism) - .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS) - .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY) - .retainCommits(writeConfig.getMetadataCleanerCommitsRetained()) - .archiveCommitsWith(minCommitsToKeep, maxCommitsToKeep) - // we will trigger compaction manually, to control the instant times - .withInlineCompaction(false) - .withMaxNumDeltaCommitsBeforeCompaction(writeConfig.getMetadataCompactDeltaCommitMax()).build()) - .withParallelism(parallelism, parallelism) - .withDeleteParallelism(parallelism) - .withRollbackParallelism(parallelism) - .withFinalizeWriteParallelism(parallelism) - .withAllowMultiWriteOnSameInstant(true) - .withKeyGenerator(HoodieTableMetadataKeyGenerator.class.getCanonicalName()) - 
.withPopulateMetaFields(writeConfig.getMetadataConfig().populateMetaFields()); - - // RecordKey properties are needed for the metadata table records - final Properties properties = new Properties(); - properties.put(HoodieTableConfig.RECORDKEY_FIELDS.key(), HoodieMetadataPayload.SCHEMA_FIELD_ID_KEY); - properties.put("hoodie.datasource.write.recordkey.field", HoodieMetadataPayload.SCHEMA_FIELD_ID_KEY); - builder.withProperties(properties); - - if (writeConfig.isMetricsOn()) { - builder.withMetricsConfig(HoodieMetricsConfig.newBuilder() - .withReporterType(writeConfig.getMetricsReporterType().toString()) - .withExecutorMetrics(writeConfig.isExecutorMetricsEnabled()) - .on(true).build()); - switch (writeConfig.getMetricsReporterType()) { - case GRAPHITE: - builder.withMetricsGraphiteConfig(HoodieMetricsGraphiteConfig.newBuilder() - .onGraphitePort(writeConfig.getGraphiteServerPort()) - .toGraphiteHost(writeConfig.getGraphiteServerHost()) - .usePrefix(writeConfig.getGraphiteMetricPrefix()).build()); - break; - case JMX: - builder.withMetricsJmxConfig(HoodieMetricsJmxConfig.newBuilder() - .onJmxPort(writeConfig.getJmxPort()) - .toJmxHost(writeConfig.getJmxHost()) - .build()); - break; - case DATADOG: - case PROMETHEUS: - case PROMETHEUS_PUSHGATEWAY: - case CONSOLE: - case INMEMORY: - case CLOUDWATCH: - break; - default: - throw new HoodieMetadataException("Unsupported Metrics Reporter type " + writeConfig.getMetricsReporterType()); - } - } - return builder.build(); - } - private void doPreBootstrapOperations(HoodieTestTable testTable) throws Exception { doPreBootstrapOperations(testTable, "0000001", "0000002"); } @@ -1765,7 +1973,10 @@ private void validateMetadata(SparkRDDWriteClient testClient) throws IOException // in the .hoodie folder. List metadataTablePartitions = FSUtils.getAllPartitionPaths(engineContext, HoodieTableMetadata.getMetadataTableBasePath(basePath), false, false); - assertEquals(MetadataPartitionType.values().length, metadataTablePartitions.size()); + assertEquals(metadataWriter.getEnabledPartitionTypes().size(), metadataTablePartitions.size()); + + final Map metadataEnabledPartitionTypes = new HashMap<>(); + metadataWriter.getEnabledPartitionTypes().forEach(e -> metadataEnabledPartitionTypes.put(e.getPartitionPath(), e)); // Metadata table should automatically compact and clean // versions are +1 as autoclean / compaction happens end of commits @@ -1773,10 +1984,13 @@ private void validateMetadata(SparkRDDWriteClient testClient) throws IOException HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metadataMetaClient, metadataMetaClient.getActiveTimeline()); metadataTablePartitions.forEach(partition -> { List latestSlices = fsView.getLatestFileSlices(partition).collect(Collectors.toList()); - assertTrue(latestSlices.stream().map(FileSlice::getBaseFile).count() <= 1, "Should have a single latest base file"); - assertTrue(latestSlices.size() <= 1, "Should have a single latest file slice"); - assertTrue(latestSlices.size() <= numFileVersions, "Should limit file slice to " - + numFileVersions + " but was " + latestSlices.size()); + assertTrue(latestSlices.stream().map(FileSlice::getBaseFile).count() + <= metadataEnabledPartitionTypes.get(partition).getFileGroupCount(), "Should have a single latest base file per file group"); + assertTrue(latestSlices.size() + <= metadataEnabledPartitionTypes.get(partition).getFileGroupCount(), "Should have a single latest file slice per file group"); + assertTrue(latestSlices.size() + <= (numFileVersions * 
metadataEnabledPartitionTypes.get(partition).getFileGroupCount()), "Should limit file slice to " + + numFileVersions + " per file group, but was " + latestSlices.size()); }); LOG.info("Validation time=" + timer.endTimer()); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedTableMetadata.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedTableMetadata.java index d6f151e34255a..70f54b111980e 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedTableMetadata.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieBackedTableMetadata.java @@ -18,30 +18,64 @@ package org.apache.hudi.client.functional; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.io.hfile.CacheConfig; +import org.apache.hadoop.hbase.util.Pair; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.avro.model.HoodieMetadataRecord; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.common.table.log.HoodieLogFormat; +import org.apache.hudi.common.table.log.block.HoodieDataBlock; +import org.apache.hudi.common.table.log.block.HoodieLogBlock; import org.apache.hudi.common.table.view.TableFileSystemView; import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.collection.ExternalSpillableMap; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.io.storage.HoodieHFileReader; import org.apache.hudi.metadata.HoodieBackedTableMetadata; +import org.apache.hudi.metadata.HoodieMetadataMergedLogRecordReader; +import org.apache.hudi.metadata.HoodieMetadataPayload; import org.apache.hudi.metadata.HoodieTableMetadataKeyGenerator; +import org.apache.hudi.metadata.MetadataPartitionType; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; import org.apache.hadoop.fs.FileStatus; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.MessageType; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.EnumSource; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.stream.Collectors; +import static org.apache.hudi.common.model.WriteOperationType.INSERT; +import static org.apache.hudi.common.model.WriteOperationType.UPSERT; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNull; +import static 
org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
@@ -123,4 +157,216 @@ public void testNotExistPartition(final HoodieTableType tableType) throws Except
    tableMetadata.getAllFilesInPartition(new Path(writeConfig.getBasePath() + "dummy"));
    assertEquals(allFilesInPartition.length, 0);
  }
+
+  /**
+   * 1. Verify metadata table records key deduplication feature. When record key
+   * deduplication is enabled, verify the metadata record payload on disk has an empty key.
+   * Otherwise, verify the valid key.
+   * 2. Verify populate meta fields works irrespective of the record key deduplication config.
+   * 3. Verify table services like compaction benefit from the record key deduplication feature.
+   */
+  @ParameterizedTest
+  @EnumSource(HoodieTableType.class)
+  public void testMetadataRecordKeyExcludeFromPayload(final HoodieTableType tableType) throws Exception {
+    initPath();
+    writeConfig = getWriteConfigBuilder(true, true, false)
+        .withMetadataConfig(HoodieMetadataConfig.newBuilder()
+            .enable(true)
+            .withPopulateMetaFields(false)
+            .withMaxNumDeltaCommitsBeforeCompaction(3)
+            .build())
+        .build();
+    init(tableType, writeConfig);
+
+    // 2nd commit
+    doWriteOperation(testTable, "0000001", INSERT);
+
+    final HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder()
+        .setConf(hadoopConf)
+        .setBasePath(metadataTableBasePath)
+        .build();
+    HoodieWriteConfig metadataTableWriteConfig = getMetadataWriteConfig(writeConfig);
+    metadataMetaClient.reloadActiveTimeline();
+    final HoodieTable table = HoodieSparkTable.create(metadataTableWriteConfig, context, metadataMetaClient);
+
+    // Compaction has not yet kicked in. Verify all the log files
+    // for the metadata records persisted on disk as per the config.
+    assertDoesNotThrow(() -> {
+      verifyMetadataRecordKeyExcludeFromPayloadLogFiles(table, metadataMetaClient, "0000001");
+    }, "Metadata table should have valid log files!");
+
+    // Verify no base file created yet.
+    assertThrows(IllegalStateException.class, () -> {
+      verifyMetadataRecordKeyExcludeFromPayloadBaseFiles(table);
+    }, "Metadata table should not have a base file yet!");
+
+    // 2 more commits
+    doWriteOperation(testTable, "0000002", UPSERT);
+    doWriteOperation(testTable, "0000004", UPSERT);
+
+    // Compaction should be triggered by now. Let's verify the log files,
+    // if any, for the metadata records persisted on disk as per the config.
+    assertDoesNotThrow(() -> {
+      verifyMetadataRecordKeyExcludeFromPayloadLogFiles(table, metadataMetaClient, "0000002");
+    }, "Metadata table should have valid log files!");
+
+    // Verify the base file created by the just completed compaction.
+    assertDoesNotThrow(() -> {
+      verifyMetadataRecordKeyExcludeFromPayloadBaseFiles(table);
+    }, "Metadata table should have a valid base file!");
+
+    // 2 more commits to trigger one more compaction, along with a clean
+    doWriteOperation(testTable, "0000005", UPSERT);
+    doClean(testTable, "0000006", Arrays.asList("0000004"));
+    doWriteOperation(testTable, "0000007", UPSERT);
+
+    assertDoesNotThrow(() -> {
+      verifyMetadataRecordKeyExcludeFromPayloadLogFiles(table, metadataMetaClient, "7");
+    }, "Metadata table should have valid log files!");
+
+    assertDoesNotThrow(() -> {
+      verifyMetadataRecordKeyExcludeFromPayloadBaseFiles(table);
+    }, "Metadata table should have a valid base file!");
+
+    validateMetadata(testTable);
+  }
+
+  /**
+   * Verify the metadata table log files for the record field correctness.
+   * The on-disk format should follow the meta fields and key deduplication configs,
+   * and the in-memory merged records should all be fully materialized irrespective
+   * of the config.
+   *
+   * @param table - Hoodie metadata test table
+   * @param metadataMetaClient - Metadata meta client
+   * @param latestCommitTimestamp - Latest commit timestamp
+   * @throws IOException
+   */
+  private void verifyMetadataRecordKeyExcludeFromPayloadLogFiles(HoodieTable table, HoodieTableMetaClient metadataMetaClient,
+                                                                 String latestCommitTimestamp) throws IOException {
+    table.getHoodieView().sync();
+
+    // A file slice with log files must be available for verification.
+    List<FileSlice> fileSlices = table.getSliceView()
+        .getLatestFileSlices(MetadataPartitionType.FILES.getPartitionPath()).collect(Collectors.toList());
+    if (fileSlices.isEmpty()) {
+      throw new IllegalStateException("LogFile slices are not available!");
+    }
+
+    // Verify the log files honor the key deduplication and virtual keys config
+    List<HoodieLogFile> logFiles = fileSlices.get(0).getLogFiles().collect(Collectors.toList());
+    List<String> logFilePaths = logFiles.stream().map(logFile -> logFile.getPath().toString())
+        .collect(Collectors.toList());
+
+    // Verify the on-disk raw records before they get materialized
+    verifyMetadataRawRecords(table, logFiles);
+
+    // Verify the in-memory materialized and merged records
+    verifyMetadataMergedRecords(metadataMetaClient, logFilePaths, latestCommitTimestamp);
+  }
+
+  /**
+   * Verify the metadata table on-disk raw records. When populate meta fields is enabled,
+   * these records should have additional meta fields in the payload. When key deduplication
+   * is enabled, these records on disk should have an empty key in the payload.
+   *
+   * @param table - Hoodie table
+   * @param logFiles - Metadata table log files to be verified
+   * @throws IOException
+   */
+  private void verifyMetadataRawRecords(HoodieTable table, List<HoodieLogFile> logFiles) throws IOException {
+    for (HoodieLogFile logFile : logFiles) {
+      FileStatus[] fsStatus = fs.listStatus(logFile.getPath());
+      MessageType writerSchemaMsg = TableSchemaResolver.readSchemaFromLogFile(fs, logFile.getPath());
+      if (writerSchemaMsg == null) {
+        // not a data block
+        continue;
+      }
+
+      Schema writerSchema = new AvroSchemaConverter().convert(writerSchemaMsg);
+      HoodieLogFormat.Reader logFileReader = HoodieLogFormat.newReader(fs, new HoodieLogFile(fsStatus[0].getPath()), writerSchema);
+
+      while (logFileReader.hasNext()) {
+        HoodieLogBlock logBlock = logFileReader.next();
+        if (logBlock instanceof HoodieDataBlock) {
+          try (ClosableIterator<IndexedRecord> recordItr = ((HoodieDataBlock) logBlock).getRecordItr()) {
+            recordItr.forEachRemaining(indexRecord -> {
+              final GenericRecord record = (GenericRecord) indexRecord;
+              assertNull(record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD));
+              assertNull(record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD));
+              final String key = String.valueOf(record.get(HoodieMetadataPayload.KEY_FIELD_NAME));
+              assertFalse(key.isEmpty());
+            });
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * Verify the metadata table in-memory merged records. Irrespective of the key deduplication
+   * config, the in-memory merged records should always have the key field in the record
+   * payload fully materialized.
+ * + * @param metadataMetaClient - Metadata table meta client + * @param logFilePaths - Metadata table log file paths + * @param latestCommitTimestamp - Latest commit timestamp + */ + private void verifyMetadataMergedRecords(HoodieTableMetaClient metadataMetaClient, List logFilePaths, String latestCommitTimestamp) { + Schema schema = HoodieAvroUtils.addMetadataFields(HoodieMetadataRecord.getClassSchema()); + HoodieMetadataMergedLogRecordReader logRecordReader = HoodieMetadataMergedLogRecordReader.newBuilder() + .withFileSystem(metadataMetaClient.getFs()) + .withBasePath(metadataMetaClient.getBasePath()) + .withLogFilePaths(logFilePaths) + .withLatestInstantTime(latestCommitTimestamp) + .withPartition(MetadataPartitionType.FILES.getPartitionPath()) + .withReaderSchema(schema) + .withMaxMemorySizeInBytes(100000L) + .withBufferSize(4096) + .withSpillableMapBasePath(tempDir.toString()) + .withDiskMapType(ExternalSpillableMap.DiskMapType.BITCASK) + .build(); + + assertDoesNotThrow(() -> { + logRecordReader.scan(); + }, "Metadata log records materialization failed"); + + for (Map.Entry> entry : logRecordReader.getRecords().entrySet()) { + assertFalse(entry.getKey().isEmpty()); + assertFalse(entry.getValue().getRecordKey().isEmpty()); + assertEquals(entry.getKey(), entry.getValue().getRecordKey()); + } + } + + /** + * Verify metadata table base files for the records persisted based on the config. When + * the key deduplication is enabled, the records persisted on the disk in the base file + * should have key field in the payload as empty string. + * + * @param table - Metadata table + */ + private void verifyMetadataRecordKeyExcludeFromPayloadBaseFiles(HoodieTable table) throws IOException { + table.getHoodieView().sync(); + List fileSlices = table.getSliceView() + .getLatestFileSlices(MetadataPartitionType.FILES.getPartitionPath()).collect(Collectors.toList()); + if (!fileSlices.get(0).getBaseFile().isPresent()) { + throw new IllegalStateException("Base file not available!"); + } + final HoodieBaseFile baseFile = fileSlices.get(0).getBaseFile().get(); + + HoodieHFileReader hoodieHFileReader = new HoodieHFileReader(context.getHadoopConf().get(), + new Path(baseFile.getPath()), + new CacheConfig(context.getHadoopConf().get())); + List> records = hoodieHFileReader.readAllRecords(); + records.forEach(entry -> { + assertNull(((GenericRecord) entry.getSecond()).get(HoodieRecord.RECORD_KEY_METADATA_FIELD)); + final String keyInPayload = (String) ((GenericRecord) entry.getSecond()) + .get(HoodieMetadataPayload.KEY_FIELD_NAME); + assertFalse(keyInPayload.isEmpty()); + }); + } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieClientOnCopyOnWriteStorage.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieClientOnCopyOnWriteStorage.java index 4466d4672bd3d..6ab33b422d0c1 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieClientOnCopyOnWriteStorage.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieClientOnCopyOnWriteStorage.java @@ -21,7 +21,7 @@ import org.apache.hudi.avro.model.HoodieCleanMetadata; import org.apache.hudi.avro.model.HoodieClusteringPlan; import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata; -import org.apache.hudi.client.AbstractHoodieWriteClient; +import org.apache.hudi.client.BaseHoodieWriteClient; import org.apache.hudi.client.HoodieWriteResult; import 
org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.SparkTaskContextSupplier; @@ -36,6 +36,7 @@ import org.apache.hudi.common.fs.ConsistencyGuardConfig; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; @@ -438,15 +439,15 @@ private void testDeduplication( String recordKey = UUID.randomUUID().toString(); HoodieKey keyOne = new HoodieKey(recordKey, "2018-01-01"); HoodieRecord recordOne = - new HoodieRecord(keyOne, dataGen.generateRandomValue(keyOne, newCommitTime)); + new HoodieAvroRecord(keyOne, dataGen.generateRandomValue(keyOne, newCommitTime)); HoodieKey keyTwo = new HoodieKey(recordKey, "2018-02-01"); HoodieRecord recordTwo = - new HoodieRecord(keyTwo, dataGen.generateRandomValue(keyTwo, newCommitTime)); + new HoodieAvroRecord(keyTwo, dataGen.generateRandomValue(keyTwo, newCommitTime)); // Same key and partition as keyTwo HoodieRecord recordThree = - new HoodieRecord(keyTwo, dataGen.generateRandomValue(keyTwo, newCommitTime)); + new HoodieAvroRecord(keyTwo, dataGen.generateRandomValue(keyTwo, newCommitTime)); JavaRDD> records = jsc.parallelize(Arrays.asList(recordOne, recordTwo, recordThree), 1); @@ -575,6 +576,9 @@ private void testUpsertsInternal(HoodieWriteConfig config, HoodieWriteConfig newConfig = getConfigBuilder().withProps(config.getProps()).withTimelineLayoutVersion( TimelineLayoutVersion.CURR_VERSION).build(); client = getHoodieWriteClient(newConfig); + + client.savepoint("004", "user1","comment1"); + client.restoreToInstant("004"); assertFalse(metaClient.reloadActiveTimeline().getRollbackTimeline().lastInstant().isPresent()); @@ -687,7 +691,7 @@ public void testInsertsPreppedWithHoodieConcatHandle(boolean populateMetaFields) } /** - * Test one of HoodieConcatHandle w/ {@link AbstractHoodieWriteClient#insert(Object, String)} API. + * Test one of HoodieConcatHandle w/ {@link BaseHoodieWriteClient#insert(Object, String)} API. * * @param config Write Config * @throws Exception in case of error @@ -973,8 +977,8 @@ private void testUpsertsUpdatePartitionPath(IndexType indexType, HoodieWriteConf throw new IllegalStateException("Unknown partition path " + rec.getPartitionPath()); } recordsToUpsert.add( - new HoodieRecord(new HoodieKey(rec.getRecordKey(), newPartitionPath), - rec.getData())); + new HoodieAvroRecord(new HoodieKey(rec.getRecordKey(), newPartitionPath), + (HoodieRecordPayload) rec.getData())); // populate expected partition path and record keys expectedPartitionPathRecKeyPairs.add(Pair.of(newPartitionPath, rec.getRecordKey())); } @@ -1375,6 +1379,40 @@ public void testSimpleClustering(boolean populateMetaFields, boolean preserveCom testInsertAndClustering(clusteringConfig, populateMetaFields, true, false, SqlQueryEqualityPreCommitValidator.class.getName(), COUNT_SQL_QUERY_FOR_VALIDATION, ""); } + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testInlineScheduleClustering(boolean scheduleInlineClustering) throws IOException { + testInsertTwoBatches(true); + + // setup clustering config. 
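+    // What the two flags below do (per the assertions at the end of this test,
+    // and stated here as the intent rather than a spec): withInlineClustering(false)
+    // keeps the writer from executing clustering inline, while
+    // withScheduleInlineClustering(scheduleInlineClustering) only controls whether
+    // a clustering plan gets scheduled at commit time, which is why the test counts
+    // pending clustering plans rather than completed ones.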
+ HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10) + .withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).withInlineClustering(false).withScheduleInlineClustering(scheduleInlineClustering) + .withPreserveHoodieCommitMetadata(true).build(); + + HoodieWriteConfig config = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY).withAutoCommit(false) + .withClusteringConfig(clusteringConfig) + .withProps(getPropertiesForKeyGen()).build(); + SparkRDDWriteClient client = getHoodieWriteClient(config); + dataGen = new HoodieTestDataGenerator(new String[] {"2015/03/16"}); + String commitTime1 = HoodieActiveTimeline.createNewInstantTime(); + List records1 = dataGen.generateInserts(commitTime1, 200); + client.startCommitWithTime(commitTime1); + JavaRDD insertRecordsRDD1 = jsc.parallelize(records1, 2); + JavaRDD statuses = client.upsert(insertRecordsRDD1, commitTime1); + List statusList = statuses.collect(); + assertNoWriteErrors(statusList); + client.commit(commitTime1, statuses); + + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build(); + List> pendingClusteringPlans = + ClusteringUtils.getAllPendingClusteringPlans(metaClient).collect(Collectors.toList()); + if (scheduleInlineClustering) { + assertEquals(1, pendingClusteringPlans.size()); + } else { + assertEquals(0, pendingClusteringPlans.size()); + } + } + @ParameterizedTest @MethodSource("populateMetaFieldsAndPreserveMetadataParams") public void testClusteringWithSortColumns(boolean populateMetaFields, boolean preserveCommitMetadata) throws Exception { @@ -1528,7 +1566,6 @@ private List testInsertAndClustering(HoodieClusteringConfig cluste Pair, List>, Set> allRecords = testInsertTwoBatches(populateMetaFields); testClustering(clusteringConfig, populateMetaFields, completeClustering, assertSameFileIds, validatorClasses, sqlQueryForEqualityValidation, sqlQueryForSingleResultValidation, allRecords); return allRecords.getLeft().getLeft(); - } /** diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieIndex.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieIndex.java index 6cd25f3992259..2ff67c3c9156d 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieIndex.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieIndex.java @@ -19,9 +19,9 @@ package org.apache.hudi.client.functional; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.fs.ConsistencyGuardConfig; import org.apache.hudi.common.model.EmptyHoodieRecordPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; @@ -104,10 +104,10 @@ private static Stream indexTypeParams() { private HoodieWriteConfig config; private void setUp(IndexType indexType, boolean populateMetaFields) throws Exception { - setUp(indexType, populateMetaFields, true, true); + setUp(indexType, populateMetaFields, true); } - private void setUp(IndexType indexType, boolean populateMetaFields, boolean enableMetadata, boolean rollbackUsingMarkers) throws Exception { + private void setUp(IndexType indexType, boolean populateMetaFields, boolean rollbackUsingMarkers) throws 
Exception { this.indexType = indexType; initPath(); initSparkContexts(); @@ -121,8 +121,8 @@ private void setUp(IndexType indexType, boolean populateMetaFields, boolean enab config = getConfigBuilder() .withProperties(populateMetaFields ? new Properties() : getPropertiesForKeyGen()) .withRollbackUsingMarkers(rollbackUsingMarkers) - .withIndexConfig(indexBuilder - .build()).withAutoCommit(false).withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(enableMetadata).build()) + .withIndexConfig(indexBuilder.build()) + .withAutoCommit(false) .withLayoutConfig(HoodieLayoutConfig.newBuilder().fromProperties(indexBuilder.build().getProps()) .withLayoutPartitioner(SparkBucketIndexPartitioner.class.getName()).build()).build(); writeClient = getHoodieWriteClient(config); @@ -237,7 +237,7 @@ public void testTagLocationAndDuplicateUpdate(IndexType indexType, boolean popul @ParameterizedTest @MethodSource("indexTypeParams") public void testSimpleTagLocationAndUpdateWithRollback(IndexType indexType, boolean populateMetaFields) throws Exception { - setUp(indexType, populateMetaFields, true, false); + setUp(indexType, populateMetaFields, false); String newCommitTime = writeClient.startCommit(); int totalRecords = 20 + random.nextInt(20); List records = dataGen.generateInserts(newCommitTime, totalRecords); @@ -309,16 +309,16 @@ public void testTagLocationAndFetchRecordLocations(IndexType indexType, boolean String recordStr4 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); HoodieRecord record1 = - new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); HoodieRecord record2 = - new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); HoodieRecord record3 = - new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4); HoodieRecord record4 = - new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); JavaRDD recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4)); HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); @@ -384,8 +384,6 @@ public void testSimpleGlobalIndexTagLocationWhenShouldUpdatePartitionPath() thro .withGlobalSimpleIndexUpdatePartitionPath(true) .withBloomIndexUpdatePartitionPath(true) .build()) - .withMetadataConfig( - HoodieMetadataConfig.newBuilder().enable(true).build()) .build(); writeClient = getHoodieWriteClient(config); index = writeClient.getIndex(); @@ -405,7 +403,7 @@ public void testSimpleGlobalIndexTagLocationWhenShouldUpdatePartitionPath() thro RawTripTestPayload originalPayload = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord originalRecord = - new HoodieRecord(new 
HoodieKey(originalPayload.getRowKey(), originalPayload.getPartitionPath()), + new HoodieAvroRecord(new HoodieKey(originalPayload.getRowKey(), originalPayload.getPartitionPath()), originalPayload); /* @@ -418,7 +416,7 @@ public void testSimpleGlobalIndexTagLocationWhenShouldUpdatePartitionPath() thro RawTripTestPayload incomingPayload = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-02-28T03:16:41.415Z\",\"number\":12}"); HoodieRecord incomingRecord = - new HoodieRecord(new HoodieKey(incomingPayload.getRowKey(), incomingPayload.getPartitionPath()), + new HoodieAvroRecord(new HoodieKey(incomingPayload.getRowKey(), incomingPayload.getPartitionPath()), incomingPayload); /* This record has the same record key as originalRecord and the same partition @@ -428,7 +426,7 @@ public void testSimpleGlobalIndexTagLocationWhenShouldUpdatePartitionPath() thro RawTripTestPayload incomingPayloadSamePartition = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T04:16:41.415Z\",\"number\":15}"); HoodieRecord incomingRecordSamePartition = - new HoodieRecord( + new HoodieAvroRecord( new HoodieKey(incomingPayloadSamePartition.getRowKey(), incomingPayloadSamePartition.getPartitionPath()), incomingPayloadSamePartition); @@ -487,7 +485,7 @@ private HoodieWriteConfig.Builder getConfigBuilder() { private JavaPairRDD>> getRecordLocations(JavaRDD keyRDD, HoodieTable hoodieTable) { JavaRDD recordRDD = tagLocation( - index, keyRDD.map(k -> new HoodieRecord(k, new EmptyHoodieRecordPayload())), hoodieTable); + index, keyRDD.map(k -> new HoodieAvroRecord(k, new EmptyHoodieRecordPayload())), hoodieTable); return recordRDD.mapToPair(hr -> new Tuple2<>(hr.getKey(), hr.isCurrentLocationKnown() ? Option.of(Pair.of(hr.getPartitionPath(), hr.getCurrentLocation().getFileId())) : Option.empty()) diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBase.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBase.java index 56c9f016bcc6e..3141e1051ce5f 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBase.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBase.java @@ -18,32 +18,42 @@ package org.apache.hudi.client.functional; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.avro.model.HoodieMetadataRecord; import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.fs.ConsistencyGuardConfig; +import org.apache.hudi.common.model.HoodieCleaningPolicy; import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.WriteConcurrencyMode; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.marker.MarkerType; +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; import org.apache.hudi.common.testutils.HoodieMetadataTestTable; import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieStorageConfig; import 
org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.config.metrics.HoodieMetricsConfig; import org.apache.hudi.config.metrics.HoodieMetricsGraphiteConfig; +import org.apache.hudi.config.metrics.HoodieMetricsJmxConfig; +import org.apache.hudi.exception.HoodieMetadataException; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.keygen.SimpleKeyGenerator; +import org.apache.hudi.metadata.HoodieMetadataPayload; import org.apache.hudi.metadata.HoodieTableMetadata; +import org.apache.hudi.metadata.HoodieTableMetadataKeyGenerator; import org.apache.hudi.metadata.HoodieTableMetadataWriter; import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.HoodieTimelineArchiveLog; +import org.apache.hudi.client.HoodieTimelineArchiver; import org.apache.hudi.testutils.HoodieClientTestHarness; - -import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.junit.jupiter.api.AfterEach; @@ -59,6 +69,7 @@ import static org.apache.hudi.common.model.WriteOperationType.INSERT; import static org.apache.hudi.common.model.WriteOperationType.UPSERT; import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; +import static org.apache.hudi.metadata.HoodieTableMetadata.METADATA_TABLE_NAME_SUFFIX; public class TestHoodieMetadataBase extends HoodieClientTestHarness { @@ -74,12 +85,22 @@ public void init(HoodieTableType tableType) throws IOException { init(tableType, true); } + public void init(HoodieTableType tableType, HoodieWriteConfig writeConfig) throws IOException { + init(tableType, Option.of(writeConfig), true, false, false, false); + } + public void init(HoodieTableType tableType, boolean enableMetadataTable) throws IOException { init(tableType, enableMetadataTable, true, false, false); } public void init(HoodieTableType tableType, boolean enableMetadataTable, boolean enableFullScan, boolean enableMetrics, boolean - validateMetadataPayloadStateConsistency) throws IOException { + validateMetadataPayloadStateConsistency) throws IOException { + init(tableType, Option.empty(), enableMetadataTable, enableFullScan, enableMetrics, + validateMetadataPayloadStateConsistency); + } + + public void init(HoodieTableType tableType, Option writeConfig, boolean enableMetadataTable, + boolean enableFullScan, boolean enableMetrics, boolean validateMetadataPayloadStateConsistency) throws IOException { this.tableType = tableType; initPath(); initSparkContexts("TestHoodieMetadata"); @@ -89,9 +110,12 @@ public void init(HoodieTableType tableType, boolean enableMetadataTable, boolean initMetaClient(tableType); initTestDataGenerator(); metadataTableBasePath = HoodieTableMetadata.getMetadataTableBasePath(basePath); - writeConfig = getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER, true, enableMetadataTable, enableMetrics, - enableFullScan, true, validateMetadataPayloadStateConsistency).build(); - initWriteConfigAndMetatableWriter(writeConfig, enableMetadataTable); + this.writeConfig = writeConfig.isPresent() + ? 
writeConfig.get() : getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER, true, + enableMetadataTable, enableMetrics, enableFullScan, true, + validateMetadataPayloadStateConsistency) + .build(); + initWriteConfigAndMetatableWriter(this.writeConfig, enableMetadataTable); } protected void initWriteConfigAndMetatableWriter(HoodieWriteConfig writeConfig, boolean enableMetadataTable) { @@ -262,8 +286,8 @@ protected void doPreBootstrapRestore(HoodieTestTable testTable, String restoreTi protected void archiveDataTable(HoodieWriteConfig writeConfig, HoodieTableMetaClient metaClient) throws IOException { HoodieTable table = HoodieSparkTable.create(writeConfig, context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(writeConfig, table); - archiveLog.archiveIfRequired(context); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(writeConfig, table); + archiver.archiveIfRequired(context); } protected void validateMetadata(HoodieTestTable testTable) throws IOException { @@ -327,4 +351,91 @@ protected HoodieWriteConfig.Builder getWriteConfigBuilder(HoodieFailedWritesClea .withProperties(properties); } + /** + * Fetching the WriteConfig for the metadata table from the data table's writeConfig is not trivial, and + * the corresponding method is not public in the source code. So, for now, this method mimics the source code. + */ + protected HoodieWriteConfig getMetadataWriteConfig(HoodieWriteConfig writeConfig) { + int parallelism = writeConfig.getMetadataInsertParallelism(); + + int minCommitsToKeep = Math.max(writeConfig.getMetadataMinCommitsToKeep(), writeConfig.getMinCommitsToKeep()); + int maxCommitsToKeep = Math.max(writeConfig.getMetadataMaxCommitsToKeep(), writeConfig.getMaxCommitsToKeep()); + + // Create the write config for the metadata table by borrowing options from the main write config.
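+    // Deliberate differences from the data table's config: single-writer
+    // concurrency, the metadata config itself disabled (a metadata table keeps
+    // no metadata table of its own), direct markers, and auto clean/compaction
+    // turned off so that tests control the instant times.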
+ HoodieWriteConfig.Builder builder = HoodieWriteConfig.newBuilder() + .withTimelineLayoutVersion(TimelineLayoutVersion.CURR_VERSION) + .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder() + .withConsistencyCheckEnabled(writeConfig.getConsistencyGuardConfig().isConsistencyCheckEnabled()) + .withInitialConsistencyCheckIntervalMs(writeConfig.getConsistencyGuardConfig().getInitialConsistencyCheckIntervalMs()) + .withMaxConsistencyCheckIntervalMs(writeConfig.getConsistencyGuardConfig().getMaxConsistencyCheckIntervalMs()) + .withMaxConsistencyChecks(writeConfig.getConsistencyGuardConfig().getMaxConsistencyChecks()) + .build()) + .withWriteConcurrencyMode(WriteConcurrencyMode.SINGLE_WRITER) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).withFileListingParallelism(writeConfig.getFileListingParallelism()).build()) + .withAutoCommit(true) + .withAvroSchemaValidate(true) + .withEmbeddedTimelineServerEnabled(false) + .withMarkersType(MarkerType.DIRECT.name()) + .withRollbackUsingMarkers(false) + .withPath(HoodieTableMetadata.getMetadataTableBasePath(writeConfig.getBasePath())) + .withSchema(HoodieMetadataRecord.getClassSchema().toString()) + .forTable(writeConfig.getTableName() + METADATA_TABLE_NAME_SUFFIX) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withAsyncClean(writeConfig.isMetadataAsyncClean()) + // we will trigger cleaning manually, to control the instant times + .withAutoClean(false) + .withCleanerParallelism(parallelism) + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS) + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY) + .retainCommits(writeConfig.getMetadataCleanerCommitsRetained()) + .archiveCommitsWith(minCommitsToKeep, maxCommitsToKeep) + // we will trigger compaction manually, to control the instant times + .withInlineCompaction(false) + .withMaxNumDeltaCommitsBeforeCompaction(writeConfig.getMetadataCompactDeltaCommitMax()).build()) + .withParallelism(parallelism, parallelism) + .withDeleteParallelism(parallelism) + .withRollbackParallelism(parallelism) + .withFinalizeWriteParallelism(parallelism) + .withAllowMultiWriteOnSameInstant(true) + .withKeyGenerator(HoodieTableMetadataKeyGenerator.class.getCanonicalName()) + .withPopulateMetaFields(writeConfig.getMetadataConfig().populateMetaFields()); + + // RecordKey properties are needed for the metadata table records + final Properties properties = new Properties(); + properties.put(HoodieTableConfig.RECORDKEY_FIELDS.key(), HoodieMetadataPayload.KEY_FIELD_NAME); + properties.put("hoodie.datasource.write.recordkey.field", HoodieMetadataPayload.KEY_FIELD_NAME); + builder.withProperties(properties); + + if (writeConfig.isMetricsOn()) { + builder.withMetricsConfig(HoodieMetricsConfig.newBuilder() + .withReporterType(writeConfig.getMetricsReporterType().toString()) + .withExecutorMetrics(writeConfig.isExecutorMetricsEnabled()) + .on(true).build()); + switch (writeConfig.getMetricsReporterType()) { + case GRAPHITE: + builder.withMetricsGraphiteConfig(HoodieMetricsGraphiteConfig.newBuilder() + .onGraphitePort(writeConfig.getGraphiteServerPort()) + .toGraphiteHost(writeConfig.getGraphiteServerHost()) + .usePrefix(writeConfig.getGraphiteMetricPrefix()).build()); + break; + case JMX: + builder.withMetricsJmxConfig(HoodieMetricsJmxConfig.newBuilder() + .onJmxPort(writeConfig.getJmxPort()) + .toJmxHost(writeConfig.getJmxHost()) + .build()); + break; + case DATADOG: + case PROMETHEUS: + case PROMETHEUS_PUSHGATEWAY: + case CONSOLE: + case INMEMORY: + case CLOUDWATCH: 
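+          // The remaining reporter types need no reporter-specific settings;
+          // the shared metrics config built above is sufficient.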
+ break; + default: + throw new HoodieMetadataException("Unsupported Metrics Reporter type " + writeConfig.getMetricsReporterType()); + } + } + return builder.build(); + } + } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBootstrap.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBootstrap.java index 057968f6f7ca5..bdbc9e72d3f4a 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBootstrap.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBootstrap.java @@ -278,7 +278,7 @@ private HoodieWriteConfig getWriteConfig(int minArchivalCommits, int maxArchival .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(minArchivalCommits, maxArchivalCommits).build()) .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() .withRemoteServerPort(timelineServicePort).build()) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()) .forTable("test-trip-table").build(); } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/TestSparkBoundedInMemoryExecutor.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/TestBoundedInMemoryExecutorInSpark.java similarity index 82% rename from hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/TestSparkBoundedInMemoryExecutor.java rename to hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/TestBoundedInMemoryExecutorInSpark.java index ecb18c6bc2828..91f9cbc96e6ed 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/TestSparkBoundedInMemoryExecutor.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/TestBoundedInMemoryExecutorInSpark.java @@ -22,12 +22,15 @@ import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor; import org.apache.hudi.common.util.queue.BoundedInMemoryQueueConsumer; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.testutils.HoodieClientTestHarness; import org.apache.avro.generic.IndexedRecord; +import org.apache.spark.TaskContext; +import org.apache.spark.TaskContext$; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -44,7 +47,7 @@ import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; -public class TestSparkBoundedInMemoryExecutor extends HoodieClientTestHarness { +public class TestBoundedInMemoryExecutorInSpark extends HoodieClientTestHarness { private final String instantTime = HoodieActiveTimeline.createNewInstantTime(); @@ -58,6 +61,11 @@ public void tearDown() throws Exception { cleanupResources(); } + private Runnable getPreExecuteRunnable() { + final TaskContext taskContext = TaskContext.get(); + return () -> TaskContext$.MODULE$.setTaskContext(taskContext); + } + @Test public void testExecutor() { @@ -85,10 +93,10 @@ protected Integer getResult() { } }; - SparkBoundedInMemoryExecutor>, Integer> executor = null; + BoundedInMemoryExecutor>, Integer> executor = null; try { - 
executor = new SparkBoundedInMemoryExecutor(hoodieWriteConfig, hoodieRecords.iterator(), consumer, - getTransformFunction(HoodieTestDataGenerator.AVRO_SCHEMA)); + executor = new BoundedInMemoryExecutor(hoodieWriteConfig.getWriteBufferLimitBytes(), hoodieRecords.iterator(), consumer, + getTransformFunction(HoodieTestDataGenerator.AVRO_SCHEMA), getPreExecuteRunnable()); int result = executor.execute(); // It should buffer and write 100 records assertEquals(100, result); @@ -131,11 +139,11 @@ protected Integer getResult() { } }; - SparkBoundedInMemoryExecutor>, Integer> executor = null; + BoundedInMemoryExecutor>, Integer> executor = null; try { - executor = new SparkBoundedInMemoryExecutor(hoodieWriteConfig, hoodieRecords.iterator(), consumer, - getTransformFunction(HoodieTestDataGenerator.AVRO_SCHEMA)); - SparkBoundedInMemoryExecutor>, Integer> finalExecutor = executor; + executor = new BoundedInMemoryExecutor(hoodieWriteConfig.getWriteBufferLimitBytes(), hoodieRecords.iterator(), consumer, + getTransformFunction(HoodieTestDataGenerator.AVRO_SCHEMA), getPreExecuteRunnable()); + BoundedInMemoryExecutor>, Integer> finalExecutor = executor; Thread.currentThread().interrupt(); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/TestBoundedInMemoryQueue.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/TestBoundedInMemoryQueue.java index c30635bb12f9e..4707a68072e9a 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/TestBoundedInMemoryQueue.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/execution/TestBoundedInMemoryQueue.java @@ -18,6 +18,7 @@ package org.apache.hudi.execution; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; @@ -82,7 +83,7 @@ public void tearDown() throws Exception { public void testRecordReading() throws Exception { final int numRecords = 128; final List hoodieRecords = dataGen.generateInserts(instantTime, numRecords); - final BoundedInMemoryQueue> queue = + final BoundedInMemoryQueue queue = new BoundedInMemoryQueue(FileIOUtils.KB, getTransformFunction(HoodieTestDataGenerator.AVRO_SCHEMA)); // Produce Future resFuture = executorService.submit(() -> { @@ -93,7 +94,7 @@ public void testRecordReading() throws Exception { final Iterator originalRecordIterator = hoodieRecords.iterator(); int recordsRead = 0; while (queue.iterator().hasNext()) { - final HoodieRecord originalRecord = originalRecordIterator.next(); + final HoodieAvroRecord originalRecord = (HoodieAvroRecord) originalRecordIterator.next(); final Option originalInsertValue = originalRecord.getData().getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA); final HoodieLazyInsertIterable.HoodieInsertValueGenResult payload = queue.iterator().next(); @@ -101,7 +102,7 @@ public void testRecordReading() throws Exception { assertEquals(originalRecord, payload.record); // cached insert value matches the expected insert value. 
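      // payload.record is now typed generically, so it is cast to HoodieAvroRecord
      // before reading the Avro insert value from its payload.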
assertEquals(originalInsertValue, - payload.record.getData().getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA)); + ((HoodieAvroRecord) payload.record).getData().getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA)); recordsRead++; } assertFalse(queue.iterator().hasNext() || originalRecordIterator.hasNext()); @@ -122,7 +123,7 @@ public void testCompositeProducerRecordReading() throws Exception { final int numProducers = 40; final List> recs = new ArrayList<>(); - final BoundedInMemoryQueue> queue = + final BoundedInMemoryQueue queue = new BoundedInMemoryQueue(FileIOUtils.KB, getTransformFunction(HoodieTestDataGenerator.AVRO_SCHEMA)); // Record Key to @@ -188,7 +189,7 @@ public void testCompositeProducerRecordReading() throws Exception { // Read recs and ensure we have covered all producer recs. while (queue.iterator().hasNext()) { - final HoodieLazyInsertIterable.HoodieInsertValueGenResult payload = queue.iterator().next(); + final HoodieLazyInsertIterable.HoodieInsertValueGenResult payload = queue.iterator().next(); final HoodieRecord rec = payload.record; Tuple2 producerPos = keyToProducerAndIndexMap.get(rec.getRecordKey()); Integer lastSeenPos = lastSeenMap.get(producerPos._1()); @@ -216,12 +217,12 @@ public void testMemoryLimitForBuffering() throws Exception { final List hoodieRecords = dataGen.generateInserts(instantTime, numRecords); // maximum number of records to keep in memory. final int recordLimit = 5; - final SizeEstimator> sizeEstimator = new DefaultSizeEstimator<>(); - HoodieLazyInsertIterable.HoodieInsertValueGenResult payload = - getTransformFunction(HoodieTestDataGenerator.AVRO_SCHEMA).apply(hoodieRecords.get(0)); + final SizeEstimator sizeEstimator = new DefaultSizeEstimator<>(); + HoodieLazyInsertIterable.HoodieInsertValueGenResult payload = + getTransformFunction(HoodieTestDataGenerator.AVRO_SCHEMA).apply((HoodieAvroRecord) hoodieRecords.get(0)); final long objSize = sizeEstimator.sizeEstimate(payload); final long memoryLimitInBytes = recordLimit * objSize; - final BoundedInMemoryQueue> queue = + final BoundedInMemoryQueue queue = new BoundedInMemoryQueue(memoryLimitInBytes, getTransformFunction(HoodieTestDataGenerator.AVRO_SCHEMA)); // Produce @@ -266,8 +267,8 @@ public void testException() throws Exception { final List hoodieRecords = dataGen.generateInserts(instantTime, numRecords); final SizeEstimator>> sizeEstimator = new DefaultSizeEstimator<>(); // queue memory limit - HoodieLazyInsertIterable.HoodieInsertValueGenResult payload = - getTransformFunction(HoodieTestDataGenerator.AVRO_SCHEMA).apply(hoodieRecords.get(0)); + HoodieLazyInsertIterable.HoodieInsertValueGenResult payload = + getTransformFunction(HoodieTestDataGenerator.AVRO_SCHEMA).apply((HoodieAvroRecord) hoodieRecords.get(0)); final long objSize = sizeEstimator.sizeEstimate(new Tuple2<>(payload.record, payload.insertValue)); final long memoryLimitInBytes = 4 * objSize; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/functional/SparkClientFunctionalTestSuite.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/functional/SparkClientFunctionalTestSuite.java index ee7427866feb8..5b20a51f5a2ed 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/functional/SparkClientFunctionalTestSuite.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/functional/SparkClientFunctionalTestSuite.java @@ -25,7 +25,10 @@ import org.junit.runner.RunWith; @RunWith(JUnitPlatform.class) -@SelectPackages({"org.apache.hudi.client.functional", 
"org.apache.hudi.table.functional"}) +@SelectPackages({ + "org.apache.hudi.client.functional", + "org.apache.hudi.table.functional", + "org.apache.hudi.index.hbase"}) @IncludeTags("functional") public class SparkClientFunctionalTestSuite { diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/TestHoodieIndexConfigs.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/TestHoodieIndexConfigs.java index 665e3a6a8e4a9..171403eb03847 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/TestHoodieIndexConfigs.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/TestHoodieIndexConfigs.java @@ -19,16 +19,10 @@ package org.apache.hudi.index; -import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.engine.HoodieEngineContext; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.config.HoodieHBaseIndexConfig; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.exception.HoodieIndexException; import org.apache.hudi.index.HoodieIndex.IndexType; import org.apache.hudi.index.bloom.HoodieBloomIndex; import org.apache.hudi.index.bloom.HoodieGlobalBloomIndex; @@ -36,9 +30,7 @@ import org.apache.hudi.index.hbase.SparkHoodieHBaseIndex; import org.apache.hudi.index.inmemory.HoodieInMemoryHashIndex; import org.apache.hudi.index.simple.HoodieSimpleIndex; -import org.apache.hudi.table.HoodieTable; -import org.apache.spark.api.java.JavaRDD; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; @@ -62,7 +54,7 @@ public void setUp(@TempDir Path tempDir) { @ParameterizedTest @EnumSource(value = IndexType.class, names = {"BLOOM", "GLOBAL_BLOOM", "SIMPLE", "GLOBAL_SIMPLE", "HBASE", "BUCKET"}) - public void testCreateIndex(IndexType indexType) throws Exception { + public void testCreateIndex(IndexType indexType) { HoodieWriteConfig config; HoodieWriteConfig.Builder clientConfigBuilder = HoodieWriteConfig.newBuilder(); HoodieIndexConfig.Builder indexConfigBuilder = HoodieIndexConfig.newBuilder(); @@ -104,15 +96,6 @@ public void testCreateIndex(IndexType indexType) throws Exception { } } - @Test - public void testCreateDummyIndex() { - HoodieWriteConfig.Builder clientConfigBuilder = HoodieWriteConfig.newBuilder(); - HoodieIndexConfig.Builder indexConfigBuilder = HoodieIndexConfig.newBuilder(); - HoodieWriteConfig config = clientConfigBuilder.withPath(basePath) - .withIndexConfig(indexConfigBuilder.withIndexClass(DummyHoodieIndex.class.getName()).build()).build(); - assertTrue(SparkHoodieIndexFactory.createIndex(config) instanceof DummyHoodieIndex); - } - @Test public void testCreateIndexWithException() { HoodieWriteConfig.Builder clientConfigBuilder = HoodieWriteConfig.newBuilder(); @@ -132,47 +115,6 @@ public void testCreateIndexWithException() { assertTrue(thrown2.getMessage().contains("Unable to instantiate class")); } - public static class DummyHoodieIndex> extends SparkHoodieIndex { - - public DummyHoodieIndex(HoodieWriteConfig config) { - super(config); - } - - @Override - public JavaRDD updateLocation(JavaRDD writeStatusRDD, - HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) throws HoodieIndexException { - return null; - } - - @Override - public JavaRDD> 
tagLocation(JavaRDD> records, - HoodieEngineContext context, - HoodieTable>, JavaRDD, JavaRDD> hoodieTable) throws HoodieIndexException { - return null; - } - - @Override - public boolean rollbackCommit(String instantTime) { - return false; - } - - @Override - public boolean isGlobal() { - return false; - } - - @Override - public boolean canIndexLogFiles() { - return false; - } - - @Override - public boolean isImplicitWithStorage() { - return false; - } - } - public static class IndexWithConstructor { public IndexWithConstructor(HoodieWriteConfig config) { diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java index 1334adb20d052..e61d6057cd80f 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java @@ -18,9 +18,14 @@ package org.apache.hudi.index.bloom; +import org.apache.avro.Schema; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.client.functional.TestHoodieMetadataBase; import org.apache.hudi.common.bloom.BloomFilter; import org.apache.hudi.common.bloom.BloomFilterFactory; import org.apache.hudi.common.bloom.BloomFilterTypeCode; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.table.HoodieTableMetaClient; @@ -32,14 +37,10 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.data.HoodieJavaPairRDD; import org.apache.hudi.data.HoodieJavaRDD; -import org.apache.hudi.io.HoodieKeyLookupHandle; +import org.apache.hudi.index.HoodieIndexUtils; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.testutils.HoodieClientTestHarness; import org.apache.hudi.testutils.HoodieSparkWriteableTestTable; - -import org.apache.avro.Schema; -import org.apache.hadoop.fs.Path; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.junit.jupiter.api.AfterEach; @@ -48,6 +49,7 @@ import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; +import scala.Tuple2; import java.nio.file.Paths; import java.util.Arrays; @@ -59,8 +61,6 @@ import java.util.stream.Collectors; import java.util.stream.Stream; -import scala.Tuple2; - import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource; import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -69,14 +69,14 @@ import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestHoodieBloomIndex extends HoodieClientTestHarness { +public class TestHoodieBloomIndex extends TestHoodieMetadataBase { private static final Schema SCHEMA = getSchemaFromResource(TestHoodieBloomIndex.class, "/exampleSchema.avsc", true); private static final String TEST_NAME_WITH_PARAMS = "[{index}] Test with rangePruning={0}, treeFiltering={1}, bucketizedChecking={2}"; public static Stream configParams() { Object[][] data = - new Object[][] {{true, true, true}, {false, true, true}, {true, true, false}, 
{true, false, true}}; + new Object[][]{{true, true, true}, {false, true, true}, {true, true, false}, {true, false, true}}; return Stream.of(data).map(Arguments::of); } @@ -99,6 +99,10 @@ private HoodieWriteConfig makeConfig(boolean rangePruning, boolean treeFiltering .withIndexConfig(HoodieIndexConfig.newBuilder().bloomIndexPruneByRanges(rangePruning) .bloomIndexTreebasedFilter(treeFiltering).bloomIndexBucketizedChecking(bucketizedChecking) .bloomIndexKeysPerBucket(2).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .withMetadataIndexBloomFilter(false) + .withMetadataIndexColumnStats(false) + .build()) .build(); } @@ -119,22 +123,22 @@ public void testLoadInvolvedFiles(boolean rangePruning, boolean treeFiltering, b RawTripTestPayload rowChange1 = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record1 = - new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); RawTripTestPayload rowChange2 = new RawTripTestPayload("{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record2 = - new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); RawTripTestPayload rowChange3 = new RawTripTestPayload("{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record3 = - new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); RawTripTestPayload rowChange4 = new RawTripTestPayload("{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record4 = - new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); List partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12"); - List> filesList = index.loadInvolvedFiles(partitions, context, hoodieTable); + List> filesList = index.loadColumnRangesFromFiles(partitions, context, hoodieTable); // Still 0, as no valid commit assertEquals(0, filesList.size()); @@ -143,7 +147,7 @@ public void testLoadInvolvedFiles(boolean rangePruning, boolean treeFiltering, b .withInserts("2015/03/12", "3", record1) .withInserts("2015/03/12", "4", record2, record3, record4); - filesList = index.loadInvolvedFiles(partitions, context, hoodieTable); + filesList = index.loadColumnRangesFromFiles(partitions, context, hoodieTable); assertEquals(4, filesList.size()); if (rangePruning) { @@ -210,16 +214,16 @@ public void testCheckUUIDsAgainstOneFile() throws Exception { + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}"; RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); HoodieRecord record1 = - new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); HoodieRecord record2 = - new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); 
+ new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); HoodieRecord record3 = - new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4); HoodieRecord record4 = - new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); // We write record1, record2 to a parquet file, but the bloom filter contains (record1, // record2, record3). @@ -241,9 +245,9 @@ public void testCheckUUIDsAgainstOneFile() throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); HoodieSparkTable table = HoodieSparkTable.create(config, context, metaClient); - HoodieKeyLookupHandle keyHandle = new HoodieKeyLookupHandle<>(config, table, Pair.of(partition, fileId)); - List results = keyHandle.checkCandidatesAgainstFile(hadoopConf, uuids, - new Path(Paths.get(basePath, partition, filename).toString())); + List results = HoodieIndexUtils.filterKeysFromFile( + new Path(Paths.get(basePath, partition, filename).toString()), uuids, hadoopConf); + assertEquals(results.size(), 2); assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0") || results.get(1).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")); @@ -286,16 +290,16 @@ public void testTagLocation(boolean rangePruning, boolean treeFiltering, boolean String recordStr4 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); HoodieRecord record1 = - new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); HoodieRecord record2 = - new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); HoodieRecord record3 = - new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4); HoodieRecord record4 = - new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); JavaRDD recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4)); // Also create the metadata and config @@ -352,15 +356,15 @@ public void testCheckExists(boolean rangePruning, boolean treeFiltering, boolean + "\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); HoodieKey key1 = new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()); - HoodieRecord record1 = new HoodieRecord(key1, rowChange1); + HoodieRecord 
record1 = new HoodieAvroRecord(key1, rowChange1); RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); HoodieKey key2 = new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()); - HoodieRecord record2 = new HoodieRecord(key2, rowChange2); + HoodieRecord record2 = new HoodieAvroRecord(key2, rowChange2); RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); HoodieKey key3 = new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()); RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4); HoodieKey key4 = new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()); - HoodieRecord record4 = new HoodieRecord(key4, rowChange4); + HoodieRecord record4 = new HoodieAvroRecord(key4, rowChange4); JavaRDD keysRDD = jsc.parallelize(Arrays.asList(key1, key2, key3, key4)); // Also create the metadata and config @@ -371,7 +375,7 @@ public void testCheckExists(boolean rangePruning, boolean treeFiltering, boolean // Let's tag HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); JavaRDD taggedRecords = tagLocation( - bloomIndex, keysRDD.map(k -> new HoodieRecord(k, null)), hoodieTable); + bloomIndex, keysRDD.map(k -> new HoodieAvroRecord(k, null)), hoodieTable); JavaPairRDD>> recordLocationsRDD = taggedRecords .mapToPair(hr -> new Tuple2<>(hr.getKey(), hr.isCurrentLocationKnown() ? Option.of(Pair.of(hr.getPartitionPath(), hr.getCurrentLocation().getFileId())) @@ -391,7 +395,7 @@ public void testCheckExists(boolean rangePruning, boolean treeFiltering, boolean // We do the tag again metaClient = HoodieTableMetaClient.reload(metaClient); hoodieTable = HoodieSparkTable.create(config, context, metaClient); - taggedRecords = tagLocation(bloomIndex, keysRDD.map(k -> new HoodieRecord(k, null)), hoodieTable); + taggedRecords = tagLocation(bloomIndex, keysRDD.map(k -> new HoodieAvroRecord(k, null)), hoodieTable); recordLocationsRDD = taggedRecords .mapToPair(hr -> new Tuple2<>(hr.getKey(), hr.isCurrentLocationKnown() ? 
Option.of(Pair.of(hr.getPartitionPath(), hr.getCurrentLocation().getFileId())) @@ -428,10 +432,10 @@ public void testBloomFilterFalseError(boolean rangePruning, boolean treeFilterin // We write record1 to a parquet file, using a bloom filter having both records RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); HoodieRecord record1 = - new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); HoodieRecord record2 = - new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); BloomFilter filter = BloomFilterFactory.createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name()); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieGlobalBloomIndex.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieGlobalBloomIndex.java index fa7d586d2dc0a..9d25907b4bf9d 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieGlobalBloomIndex.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieGlobalBloomIndex.java @@ -19,10 +19,10 @@ package org.apache.hudi.index.bloom; import org.apache.hudi.common.model.EmptyHoodieRecordPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.testutils.RawTripTestPayload; -import org.apache.hudi.common.util.collection.ImmutablePair; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieWriteConfig; @@ -92,24 +92,24 @@ public void testLoadInvolvedFiles() throws Exception { RawTripTestPayload rowChange1 = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record1 = - new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); RawTripTestPayload rowChange2 = new RawTripTestPayload("{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record2 = - new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); RawTripTestPayload rowChange3 = new RawTripTestPayload("{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record3 = - new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); RawTripTestPayload rowChange4 = new RawTripTestPayload("{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record4 = - new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); // 
intentionally missed the partition "2015/03/12" to see if the GlobalBloomIndex can pick it up List partitions = Arrays.asList("2016/01/21", "2016/04/01"); // partitions will NOT be respected by this loadInvolvedFiles(...) call - List> filesList = index.loadInvolvedFiles(partitions, context, hoodieTable); + List> filesList = index.loadColumnRangesFromFiles(partitions, context, hoodieTable); // Still 0, as no valid commit assertEquals(0, filesList.size()); @@ -118,7 +118,7 @@ public void testLoadInvolvedFiles() throws Exception { .withInserts("2015/03/12", "3", record1) .withInserts("2015/03/12", "4", record2, record3, record4); - filesList = index.loadInvolvedFiles(partitions, context, hoodieTable); + filesList = index.loadColumnRangesFromFiles(partitions, context, hoodieTable); assertEquals(4, filesList.size()); Map filesMap = toFileMap(filesList); @@ -158,7 +158,7 @@ public void testExplodeRecordRDDWithFileComparisons() { jsc.parallelize(Arrays.asList(new Tuple2<>("2017/10/21", "003"), new Tuple2<>("2017/10/22", "002"), new Tuple2<>("2017/10/22", "005"), new Tuple2<>("2017/10/23", "004"))).mapToPair(t -> t); - List> comparisonKeyList = HoodieJavaRDD.getJavaRDD( + List> comparisonKeyList = HoodieJavaRDD.getJavaRDD( index.explodeRecordsWithFileComparisons(partitionToFileIndexInfo, HoodieJavaPairRDD.of(partitionRecordKeyPairRDD))).collect(); @@ -200,28 +200,28 @@ public void testTagLocation() throws Exception { RawTripTestPayload rowChange1 = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record1 = - new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); RawTripTestPayload rowChange2 = new RawTripTestPayload("{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record2 = - new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); RawTripTestPayload rowChange3 = new RawTripTestPayload("{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record3 = - new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); // this record will be saved in table and will be tagged to the incoming record5 RawTripTestPayload rowChange4 = new RawTripTestPayload("{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record4 = - new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); // this has the same record key as record4 but different time so different partition, but globalbloomIndex should // tag the original partition of the saved record4 RawTripTestPayload rowChange5 = new RawTripTestPayload("{\"_row_key\":\"003\",\"time\":\"2016-02-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord record5 = - new HoodieRecord(new HoodieKey(rowChange5.getRowKey(), rowChange5.getPartitionPath()), rowChange5); + new HoodieAvroRecord(new HoodieKey(rowChange5.getRowKey(), rowChange5.getPartitionPath()), rowChange5); JavaRDD recordRDD = 
jsc.parallelize(Arrays.asList(record1, record2, record3, record5)); @@ -281,7 +281,7 @@ public void testTagLocationWhenShouldUpdatePartitionPath() throws Exception { RawTripTestPayload originalPayload = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"); HoodieRecord originalRecord = - new HoodieRecord(new HoodieKey(originalPayload.getRowKey(), originalPayload.getPartitionPath()), + new HoodieAvroRecord(new HoodieKey(originalPayload.getRowKey(), originalPayload.getPartitionPath()), originalPayload); /* @@ -294,7 +294,7 @@ public void testTagLocationWhenShouldUpdatePartitionPath() throws Exception { RawTripTestPayload incomingPayload = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-02-28T03:16:41.415Z\",\"number\":12}"); HoodieRecord incomingRecord = - new HoodieRecord(new HoodieKey(incomingPayload.getRowKey(), incomingPayload.getPartitionPath()), + new HoodieAvroRecord(new HoodieKey(incomingPayload.getRowKey(), incomingPayload.getPartitionPath()), incomingPayload); /* @@ -305,7 +305,7 @@ public void testTagLocationWhenShouldUpdatePartitionPath() throws Exception { RawTripTestPayload incomingPayloadSamePartition = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T04:16:41.415Z\",\"number\":15}"); HoodieRecord incomingRecordSamePartition = - new HoodieRecord( + new HoodieAvroRecord( new HoodieKey(incomingPayloadSamePartition.getRowKey(), incomingPayloadSamePartition.getPartitionPath()), incomingPayloadSamePartition); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestKeyRangeLookupTree.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestKeyRangeLookupTree.java index 012d0dfa35910..1c6973db746bc 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestKeyRangeLookupTree.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestKeyRangeLookupTree.java @@ -80,7 +80,7 @@ public void testFileGroupLookUpManyEntriesWithSameStartValue() { * Tests for many duplicate entries in the tree. 
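 * Each duplicate range is registered under a distinct file ID, and a lookup
 * is expected to return all of them.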
*/ @Test - public void testFileGroupLookUpManyDulicateEntries() { + public void testFileGroupLookUpManyDuplicateEntries() { KeyRangeNode toInsert = new KeyRangeNode(Long.toString(1200), Long.toString(2000), UUID.randomUUID().toString()); updateExpectedMatchesToTest(toInsert); keyRangeLookupTree.insert(toInsert); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bucket/TestBucketIdentifier.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bucket/TestBucketIdentifier.java index 879d9933978a0..4491a74fa62ba 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bucket/TestBucketIdentifier.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bucket/TestBucketIdentifier.java @@ -18,16 +18,18 @@ package org.apache.hudi.index.bucket; -import java.util.Arrays; -import java.util.List; - -import org.apache.avro.generic.GenericRecord; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.keygen.KeyGenUtils; import org.apache.hudi.testutils.KeyGeneratorTestUtilities; + +import org.apache.avro.generic.GenericRecord; import org.junit.jupiter.api.Test; +import java.util.Arrays; +import java.util.List; + public class TestBucketIdentifier { @Test @@ -44,7 +46,7 @@ public void testBucketIdWithSimpleRecordKey() { String recordKeyField = "_row_key"; String indexKeyField = "_row_key"; GenericRecord record = KeyGeneratorTestUtilities.getRecord(); - HoodieRecord hoodieRecord = new HoodieRecord( + HoodieRecord hoodieRecord = new HoodieAvroRecord( new HoodieKey(KeyGenUtils.getRecordKey(record, recordKeyField, false), ""), null); int bucketId = BucketIdentifier.getBucketId(hoodieRecord, indexKeyField, 8); assert bucketId == BucketIdentifier.getBucketId( @@ -56,7 +58,7 @@ public void testBucketIdWithComplexRecordKey() { List recordKeyField = Arrays.asList("_row_key","ts_ms"); String indexKeyField = "_row_key"; GenericRecord record = KeyGeneratorTestUtilities.getRecord(); - HoodieRecord hoodieRecord = new HoodieRecord( + HoodieRecord hoodieRecord = new HoodieAvroRecord( new HoodieKey(KeyGenUtils.getRecordKey(record, recordKeyField, false), ""), null); int bucketId = BucketIdentifier.getBucketId(hoodieRecord, indexKeyField, 8); assert bucketId == BucketIdentifier.getBucketId( diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bucket/TestHoodieBucketIndex.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bucket/TestHoodieBucketIndex.java index c79f9aec773ed..2b3765948bb63 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bucket/TestHoodieBucketIndex.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bucket/TestHoodieBucketIndex.java @@ -19,8 +19,8 @@ package org.apache.hudi.index.bucket; -import org.apache.avro.Schema; import org.apache.hudi.common.data.HoodieData; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.testutils.RawTripTestPayload; @@ -34,6 +34,8 @@ import org.apache.hudi.table.HoodieTable; import org.apache.hudi.testutils.HoodieClientTestHarness; import org.apache.hudi.testutils.HoodieSparkWriteableTestTable; + +import org.apache.avro.Schema; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; @@ 
-46,8 +48,8 @@ import java.util.UUID; import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource; -import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; public class TestHoodieBucketIndex extends HoodieClientTestHarness { @@ -93,23 +95,23 @@ public void testTagLocation() throws Exception { String recordStr3 = "{\"_row_key\":\"" + rowKey3 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; String recordStr4 = "{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); - HoodieRecord record1 = new HoodieRecord( + HoodieRecord record1 = new HoodieAvroRecord( new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); - HoodieRecord record2 = new HoodieRecord( + HoodieRecord record2 = new HoodieAvroRecord( new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); - HoodieRecord record3 = new HoodieRecord( + HoodieRecord record3 = new HoodieAvroRecord( new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4); - HoodieRecord record4 = new HoodieRecord( + HoodieRecord record4 = new HoodieAvroRecord( new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); - JavaRDD recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4)); + JavaRDD> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4)); HoodieWriteConfig config = makeConfig(); HoodieTable table = HoodieSparkTable.create(config, context, metaClient); HoodieBucketIndex bucketIndex = new HoodieBucketIndex(config); - HoodieData taggedRecordRDD = bucketIndex.tagLocation(HoodieJavaRDD.of(recordRDD), context, table); + HoodieData> taggedRecordRDD = bucketIndex.tagLocation(HoodieJavaRDD.of(recordRDD), context, table); assertFalse(taggedRecordRDD.collectAsList().stream().anyMatch(r -> r.isCurrentLocationKnown())); HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(table, SCHEMA); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHBaseIndex.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/hbase/TestSparkHoodieHBaseIndex.java similarity index 97% rename from hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHBaseIndex.java rename to hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/hbase/TestSparkHoodieHBaseIndex.java index b35fee0153103..87bcad04bc85e 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHBaseIndex.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/hbase/TestSparkHoodieHBaseIndex.java @@ -16,15 +16,16 @@ * limitations under the License. 
*/ -package org.apache.hudi.client.functional; +package org.apache.hudi.index.hbase; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; -import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.model.EmptyHoodieRecordPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.HoodieTableMetaClient; @@ -37,7 +38,6 @@ import org.apache.hudi.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndex; -import org.apache.hudi.index.hbase.SparkHoodieHBaseIndex; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.testutils.SparkClientFunctionalTestHarness; @@ -93,7 +93,7 @@ */ @TestMethodOrder(MethodOrderer.Alphanumeric.class) @Tag("functional") -public class TestHBaseIndex extends SparkClientFunctionalTestHarness { +public class TestSparkHoodieHBaseIndex extends SparkClientFunctionalTestHarness { private static final String TABLE_NAME = "test_table"; private static HBaseTestingUtility utility; @@ -190,13 +190,13 @@ public void testTagLocationAndPartitionPathUpdate() throws Exception { final String newCommitTime = "001"; final int numRecords = 10; final String oldPartitionPath = "1970/01/01"; - final String emptyHoodieRecordPayloadClasssName = EmptyHoodieRecordPayload.class.getName(); + final String emptyHoodieRecordPayloadClassName = EmptyHoodieRecordPayload.class.getName(); List newRecords = dataGen.generateInserts(newCommitTime, numRecords); List oldRecords = new LinkedList(); for (HoodieRecord newRecord: newRecords) { HoodieKey key = new HoodieKey(newRecord.getRecordKey(), oldPartitionPath); - HoodieRecord hoodieRecord = new HoodieRecord(key, newRecord.getData()); + HoodieRecord hoodieRecord = new HoodieAvroRecord(key, (HoodieRecordPayload) newRecord.getData()); oldRecords.add(hoodieRecord); } @@ -225,12 +225,12 @@ public void testTagLocationAndPartitionPathUpdate() throws Exception { assertEquals(numRecords * 2L, taggedRecords.stream().count()); // Verify the number of deleted records assertEquals(numRecords, taggedRecords.stream().filter(record -> record.getKey().getPartitionPath().equals(oldPartitionPath) - && record.getData().getClass().getName().equals(emptyHoodieRecordPayloadClasssName)).count()); + && record.getData().getClass().getName().equals(emptyHoodieRecordPayloadClassName)).count()); // Verify the number of inserted records assertEquals(numRecords, taggedRecords.stream().filter(record -> !record.getKey().getPartitionPath().equals(oldPartitionPath)).count()); // not allowed path change test - index = new SparkHoodieHBaseIndex<>(getConfig(false, false)); + index = new SparkHoodieHBaseIndex(getConfig(false, false)); List notAllowPathChangeRecords = tagLocation(index, newWriteRecords, hoodieTable).collect(); assertEquals(numRecords, notAllowPathChangeRecords.stream().count()); assertEquals(numRecords, taggedRecords.stream().filter(hoodieRecord -> hoodieRecord.isCurrentLocationKnown() @@ -291,7 +291,7 @@ public void testTagLocationAndPartitionPathUpdateWithExplicitRollback() throws E List oldRecords = new LinkedList(); for (HoodieRecord 
newRecord: newRecords) { HoodieKey key = new HoodieKey(newRecord.getRecordKey(), oldPartitionPath); - HoodieRecord hoodieRecord = new HoodieRecord(key, newRecord.getData()); + HoodieRecord hoodieRecord = new HoodieAvroRecord(key, (HoodieRecordPayload) newRecord.getData()); oldRecords.add(hoodieRecord); } JavaRDD newWriteRecords = jsc().parallelize(newRecords, 1); @@ -341,8 +341,7 @@ public void testTagLocationAndPartitionPathUpdateWithExplicitRollback() throws E public void testSimpleTagLocationAndUpdateWithRollback() throws Exception { // Load to memory HoodieWriteConfig config = getConfigBuilder(100, false, false) - .withRollbackUsingMarkers(false) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()).build(); + .withRollbackUsingMarkers(false).build(); SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); SparkRDDWriteClient writeClient = getHoodieWriteClient(config); @@ -430,8 +429,7 @@ public void testSimpleTagLocationWithInvalidCommit() throws Exception { public void testEnsureTagLocationUsesCommitTimeline() throws Exception { // Load to memory HoodieWriteConfig config = getConfigBuilder(100, false, false) - .withRollbackUsingMarkers(false) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()).build(); + .withRollbackUsingMarkers(false).build(); SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config); SparkRDDWriteClient writeClient = getHoodieWriteClient(config); @@ -764,7 +762,7 @@ public void testDelete() throws Exception { // is not implemented via HoodieWriteClient JavaRDD deleteWriteStatues = writeStatues.map(w -> { WriteStatus newWriteStatus = new WriteStatus(true, 1.0); - w.getWrittenRecords().forEach(r -> newWriteStatus.markSuccess(new HoodieRecord(r.getKey(), null), Option.empty())); + w.getWrittenRecords().forEach(r -> newWriteStatus.markSuccess(new HoodieAvroRecord(r.getKey(), null), Option.empty())); assertEquals(w.getTotalRecords(), newWriteStatus.getTotalRecords()); newWriteStatus.setStat(new HoodieWriteStat()); return newWriteStatus; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiveLog.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiver.java similarity index 89% rename from hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiveLog.java rename to hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiver.java index 4902d74264a09..652dbcb155b0e 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiveLog.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiver.java @@ -18,8 +18,8 @@ package org.apache.hudi.io; -import org.apache.hadoop.fs.FileStatus; import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.client.HoodieTimelineArchiver; import org.apache.hudi.client.utils.MetadataConversionUtils; import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.fs.HoodieWrapperFileSystem; @@ -48,16 +48,17 @@ import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; -import org.apache.hudi.table.HoodieTimelineArchiveLog; import org.apache.hudi.testutils.HoodieClientTestHarness; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; import 
org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; import org.junit.jupiter.params.provider.ValueSource; import java.io.IOException; @@ -72,14 +73,15 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import static org.apache.hudi.common.testutils.HoodieTestUtils.createCompactionCommitInMetadataTable; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestHoodieTimelineArchiveLog extends HoodieClientTestHarness { +public class TestHoodieTimelineArchiver extends HoodieClientTestHarness { - private static final Logger LOG = LogManager.getLogger(TestHoodieTimelineArchiveLog.class); + private static final Logger LOG = LogManager.getLogger(TestHoodieTimelineArchiver.class); private Configuration hadoopConf; private HoodieWrapperFileSystem wrapperFs; @@ -172,8 +174,8 @@ public void testArchiveEmptyTable() throws Exception { .withParallelism(2, 2).forTable("test-trip-table").build(); metaClient = HoodieTableMetaClient.reload(metaClient); HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); - boolean result = archiveLog.archiveIfRequired(context); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(cfg, table); + boolean result = archiver.archiveIfRequired(context); assertTrue(result); } @@ -213,7 +215,7 @@ public void testArchiveTableWithArchival(boolean enableMetadata) throws Exceptio @ParameterizedTest @ValueSource(booleans = {true, false}) public void testMergeSmallArchiveFilesRecoverFromBuildPlanFailed(boolean enableArchiveMerge) throws Exception { - HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(false, 2, 3, 2, enableArchiveMerge, 3, 209715200); + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, 2, 3, 2, enableArchiveMerge, 3, 209715200); // do ingestion and trigger archive actions here. for (int i = 1; i < 8; i++) { @@ -224,14 +226,14 @@ public void testMergeSmallArchiveFilesRecoverFromBuildPlanFailed(boolean enableA // build a merge small archive plan with dummy content // this plan can not be deserialized. HoodieTable table = HoodieSparkTable.create(writeConfig, context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(writeConfig, table); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(writeConfig, table); FileStatus[] fsStatuses = metaClient.getFs().globStatus( new Path(metaClient.getArchivePath() + "/.commits_.archive*")); List candidateFiles = Arrays.stream(fsStatuses).map(fs -> fs.getPath().toString()).collect(Collectors.toList()); - archiveLog.reOpenWriter(); + archiver.reOpenWriter(); Path plan = new Path(metaClient.getArchivePath(), HoodieArchivedTimeline.MERGE_ARCHIVE_PLAN_NAME); - archiveLog.buildArchiveMergePlan(candidateFiles, plan, ".commits_.archive.3_1-0-1"); + archiver.buildArchiveMergePlan(candidateFiles, plan, ".commits_.archive.3_1-0-1"); String s = "Dummy Content"; // stain the current merge plan file. 
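// Illustrative sketch of why the dummy bytes force the recovery path (assumes the
// archiver reads the plan back via TimelineMetadataUtils.deserializeAvroMetadata into
// HoodieMergeArchiveFilePlan, as done elsewhere in this codebase):
//   try {
//     TimelineMetadataUtils.deserializeAvroMetadata(
//         FileIOUtils.readDataFromPath(metaClient.getFs(), plan).get(), HoodieMergeArchiveFilePlan.class);
//     fail("Dummy content must not deserialize into a merge-archive plan");
//   } catch (IOException e) {
//     // expected: an unreadable plan makes the next archive run discard the half-done merge
//   }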
FileIOUtils.createFileInPath(metaClient.getFs(), plan, Option.of(s.getBytes())); @@ -264,7 +266,7 @@ public void testMergeSmallArchiveFilesRecoverFromBuildPlanFailed(boolean enableA @ParameterizedTest @ValueSource(booleans = {true, false}) public void testMergeSmallArchiveFilesRecoverFromMergeFailed(boolean enableArchiveMerge) throws Exception { - HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(false, 2, 3, 2, enableArchiveMerge, 3, 209715200); + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, 2, 3, 2, enableArchiveMerge, 3, 209715200); // do ingestion and trigger archive actions here. for (int i = 1; i < 8; i++) { @@ -274,15 +276,15 @@ public void testMergeSmallArchiveFilesRecoverFromMergeFailed(boolean enableArchi // do a single merge small archive files HoodieTable table = HoodieSparkTable.create(writeConfig, context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(writeConfig, table); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(writeConfig, table); FileStatus[] fsStatuses = metaClient.getFs().globStatus( new Path(metaClient.getArchivePath() + "/.commits_.archive*")); List candidateFiles = Arrays.stream(fsStatuses).map(fs -> fs.getPath().toString()).collect(Collectors.toList()); - archiveLog.reOpenWriter(); + archiver.reOpenWriter(); - archiveLog.buildArchiveMergePlan(candidateFiles, new Path(metaClient.getArchivePath(), HoodieArchivedTimeline.MERGE_ARCHIVE_PLAN_NAME), ".commits_.archive.3_1-0-1"); - archiveLog.mergeArchiveFiles(Arrays.stream(fsStatuses).collect(Collectors.toList())); - HoodieLogFormat.Writer writer = archiveLog.reOpenWriter(); + archiver.buildArchiveMergePlan(candidateFiles, new Path(metaClient.getArchivePath(), HoodieArchivedTimeline.MERGE_ARCHIVE_PLAN_NAME), ".commits_.archive.3_1-0-1"); + archiver.mergeArchiveFiles(Arrays.stream(fsStatuses).collect(Collectors.toList())); + HoodieLogFormat.Writer writer = archiver.reOpenWriter(); // check loading archived and active timeline success HoodieActiveTimeline rawActiveTimeline = new HoodieActiveTimeline(metaClient, false); @@ -317,7 +319,7 @@ public void testMergeSmallArchiveFilesRecoverFromMergeFailed(boolean enableArchi @ParameterizedTest @ValueSource(booleans = {true, false}) public void testMergeSmallArchiveFilesRecoverFromDeleteFailed(boolean enableArchiveMerge) throws Exception { - HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(false, 2, 3, 2, enableArchiveMerge, 3, 209715200); + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, 2, 3, 2, enableArchiveMerge, 3, 209715200); // do ingestion and trigger archive actions here. 
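// Note on the magic number in the write config above: 209715200 = 200 * 1024 * 1024,
// i.e. a 200 MB size threshold for the archive-merge settings (assumed meaning; the
// constant below is hypothetical and only spells out the arithmetic):
//   long assumedArchiveMergeSizeLimitBytes = 200L * 1024 * 1024; // = 209715200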
for (int i = 1; i < 8; i++) { @@ -327,16 +329,16 @@ public void testMergeSmallArchiveFilesRecoverFromDeleteFailed(boolean enableArch // do a single merge small archive files HoodieTable table = HoodieSparkTable.create(writeConfig, context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(writeConfig, table); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(writeConfig, table); FileStatus[] fsStatuses = metaClient.getFs().globStatus( new Path(metaClient.getArchivePath() + "/.commits_.archive*")); List candidateFiles = Arrays.stream(fsStatuses).map(fs -> fs.getPath().toString()).collect(Collectors.toList()); - archiveLog.reOpenWriter(); + archiver.reOpenWriter(); - archiveLog.buildArchiveMergePlan(candidateFiles, new Path(metaClient.getArchivePath(), HoodieArchivedTimeline.MERGE_ARCHIVE_PLAN_NAME), ".commits_.archive.3_1-0-1"); - archiveLog.mergeArchiveFiles(Arrays.stream(fsStatuses).collect(Collectors.toList())); - archiveLog.reOpenWriter(); + archiver.buildArchiveMergePlan(candidateFiles, new Path(metaClient.getArchivePath(), HoodieArchivedTimeline.MERGE_ARCHIVE_PLAN_NAME), ".commits_.archive.3_1-0-1"); + archiver.mergeArchiveFiles(Arrays.stream(fsStatuses).collect(Collectors.toList())); + archiver.reOpenWriter(); // delete only one of the small archive file to simulate delete action failed. metaClient.getFs().delete(fsStatuses[0].getPath()); @@ -362,7 +364,7 @@ public void testMergeSmallArchiveFilesRecoverFromDeleteFailed(boolean enableArch @ParameterizedTest @ValueSource(booleans = {true, false}) public void testLoadArchiveTimelineWithDamagedPlanFile(boolean enableArchiveMerge) throws Exception { - HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(false, 2, 3, 2, enableArchiveMerge, 3, 209715200); + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, 2, 3, 2, enableArchiveMerge, 3, 209715200); // do ingestion and trigger archive actions here. for (int i = 1; i < 8; i++) { @@ -390,23 +392,23 @@ public void testLoadArchiveTimelineWithDamagedPlanFile(boolean enableArchiveMerg @ParameterizedTest @ValueSource(booleans = {true, false}) public void testLoadArchiveTimelineWithUncompletedMergeArchiveFile(boolean enableArchiveMerge) throws Exception { - HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(false, 2, 3, 2, enableArchiveMerge, 3, 209715200); + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, 2, 3, 2, enableArchiveMerge, 3, 209715200); for (int i = 1; i < 8; i++) { testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ? 
Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2); archiveAndGetCommitsList(writeConfig); } HoodieTable table = HoodieSparkTable.create(writeConfig, context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(writeConfig, table); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(writeConfig, table); FileStatus[] fsStatuses = metaClient.getFs().globStatus( new Path(metaClient.getArchivePath() + "/.commits_.archive*")); List candidateFiles = Arrays.stream(fsStatuses).map(fs -> fs.getPath().toString()).collect(Collectors.toList()); - archiveLog.reOpenWriter(); + archiver.reOpenWriter(); - archiveLog.buildArchiveMergePlan(candidateFiles, new Path(metaClient.getArchivePath(), HoodieArchivedTimeline.MERGE_ARCHIVE_PLAN_NAME), ".commits_.archive.3_1-0-1"); - archiveLog.mergeArchiveFiles(Arrays.stream(fsStatuses).collect(Collectors.toList())); - HoodieLogFormat.Writer writer = archiveLog.reOpenWriter(); + archiver.buildArchiveMergePlan(candidateFiles, new Path(metaClient.getArchivePath(), HoodieArchivedTimeline.MERGE_ARCHIVE_PLAN_NAME), ".commits_.archive.3_1-0-1"); + archiver.mergeArchiveFiles(Arrays.stream(fsStatuses).collect(Collectors.toList())); + HoodieLogFormat.Writer writer = archiver.reOpenWriter(); String s = "Dummy Content"; // stain the current merged archive file. @@ -451,15 +453,16 @@ public void testNoArchivalUntilMaxArchiveConfigWithExtraInflightCommits(boolean assertEquals(originalCommits, commitsAfterArchival); } - @Test - public void testArchiveCommitSavepointNoHole() throws Exception { + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testArchiveCommitSavepointNoHole(boolean enableMetadataTable) throws Exception { init(); HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable("test-trip-table") .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 5).build()) .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() .withRemoteServerPort(timelineServicePort).build()) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(enableMetadataTable).build()) .build(); HoodieTestDataGenerator.createCommitFile(basePath, "100", wrapperFs.getConf()); @@ -470,11 +473,17 @@ public void testArchiveCommitSavepointNoHole() throws Exception { HoodieTestDataGenerator.createCommitFile(basePath, "104", wrapperFs.getConf()); HoodieTestDataGenerator.createCommitFile(basePath, "105", wrapperFs.getConf()); HoodieTable table = HoodieSparkTable.create(cfg, context); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(cfg, table); + + if (enableMetadataTable) { + // Simulate a compaction commit in metadata table timeline + // so the archival in data table can happen + createCompactionCommitInMetadataTable(hadoopConf, wrapperFs, basePath, "105"); + } HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); assertEquals(6, timeline.countInstants(), "Loaded 6 commits and the count should match"); - assertTrue(archiveLog.archiveIfRequired(context)); + assertTrue(archiver.archiveIfRequired(context)); timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(); assertEquals(5, 
timeline.countInstants(), "Since we have a savepoint at 101, we should never archive any commit after 101 (we only archive 100)"); @@ -593,8 +602,9 @@ public void testNoArchivalWithInflightCompactionInMiddle(boolean enableMetadata) verifyArchival(archivedInstants, getActiveCommitInstants(Arrays.asList("00000007", "00000008"), HoodieTimeline.DELTA_COMMIT_ACTION), commitsAfterArchival); } - @Test - public void testArchiveCommitTimeline() throws Exception { + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testArchiveCommitTimeline(boolean enableMetadataTable) throws Exception { init(); HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) @@ -602,7 +612,7 @@ public void testArchiveCommitTimeline() throws Exception { .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 3).build()) .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() .withRemoteServerPort(timelineServicePort).build()) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(enableMetadataTable).build()) .build(); metaClient = HoodieTableMetaClient.reload(metaClient); @@ -619,9 +629,15 @@ public void testArchiveCommitTimeline() throws Exception { HoodieTestDataGenerator.createCommitFile(basePath, "4", wrapperFs.getConf()); HoodieTestDataGenerator.createCommitFile(basePath, "5", wrapperFs.getConf()); + if (enableMetadataTable) { + // Simulate a compaction commit in metadata table timeline + // so the archival in data table can happen + createCompactionCommitInMetadataTable(hadoopConf, wrapperFs, basePath, "5"); + } + HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); - boolean result = archiveLog.archiveIfRequired(context); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(cfg, table); + boolean result = archiver.archiveIfRequired(context); assertTrue(result); HoodieArchivedTimeline archivedTimeline = metaClient.getArchivedTimeline(); List archivedInstants = Arrays.asList(instant1, instant2, instant3); @@ -655,7 +671,8 @@ public void testConvertCommitMetadata() throws Exception { public void testArchiveTableWithCleanCommits(boolean enableMetadata) throws Exception { HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(enableMetadata, 2, 4, 2); - // min archival commits is 2 and max archival commits is 4(either clean commits has to be > 4 or commits has to be greater than 4. + // min archival commits is 2 and max archival commits is 4 + // (either clean commits has to be > 4 or commits has to be greater than 4) // and so, after 5th commit, 3 commits will be archived. // 1,2,3,4,5,6 : after archival -> 1,5,6 (because, 2,3,4,5 and 6 are clean commits and are eligible for archival) // after 7th and 8th commit no-op wrt archival. 
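// Worked example of the thresholds above, assuming archiveCommitsWith(min, max) keeps at
// least `min` instants and triggers only once the active timeline grows past `max`: with
// (2, 4), the 5th commit pushes the count past 4, so 5 - 2 = 3 instants become archival
// candidates; clean commits then decide which candidates actually move, per the
// 1,2,3,4,5,6 -> 1,5,6 trace above.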
@@ -712,10 +729,9 @@ public void testArchiveTableWithCleanCommits(boolean enableMetadata) throws Exce @Test public void testArchiveRollbacksAndCleanTestTable() throws Exception { - boolean enableMetadata = false; int minArchiveCommits = 2; int maxArchiveCommits = 9; - HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(enableMetadata, minArchiveCommits, maxArchiveCommits, 2); + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, minArchiveCommits, maxArchiveCommits, 2); // trigger 1 commit to add lot of files so that future cleans can clean them up testTable.doWriteOperation("00000001", WriteOperationType.UPSERT, Arrays.asList("p1", "p2"), Arrays.asList("p1", "p2"), 20); @@ -750,8 +766,8 @@ public void testArchiveRollbacksAndCleanTestTable() throws Exception { } @ParameterizedTest - @ValueSource(booleans = {true, false}) - public void testArchiveCompletedRollbackAndClean(boolean isEmpty) throws Exception { + @CsvSource({"true,true", "true,false", "false,true", "false,false"}) + public void testArchiveCompletedRollbackAndClean(boolean isEmpty, boolean enableMetadataTable) throws Exception { init(); int minInstantsToKeep = 2; int maxInstantsToKeep = 10; @@ -761,7 +777,7 @@ public void testArchiveCompletedRollbackAndClean(boolean isEmpty) throws Excepti .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(minInstantsToKeep, maxInstantsToKeep).build()) .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() .withRemoteServerPort(timelineServicePort).build()) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(enableMetadataTable).build()) .build(); metaClient = HoodieTableMetaClient.reload(metaClient); @@ -774,10 +790,16 @@ public void testArchiveCompletedRollbackAndClean(boolean isEmpty) throws Excepti createCommitAndRollbackFile(startInstant + 1 + "", startInstant + "", false, isEmpty || i % 2 == 0); } + if (enableMetadataTable) { + // Simulate a compaction commit in metadata table timeline + // so the archival in data table can happen + createCompactionCommitInMetadataTable(hadoopConf, wrapperFs, basePath, Integer.toString(99)); + } + HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(cfg, table); - archiveLog.archiveIfRequired(context); + archiver.archiveIfRequired(context); Stream currentInstants = metaClient.getActiveTimeline().reload().getInstants(); Map> actionInstantMap = currentInstants.collect(Collectors.groupingBy(HoodieInstant::getAction)); @@ -789,8 +811,9 @@ public void testArchiveCompletedRollbackAndClean(boolean isEmpty) throws Excepti assertEquals(minInstantsToKeep, actionInstantMap.get("rollback").size(), "Should have min instant"); } - @Test - public void testArchiveInflightClean() throws Exception { + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testArchiveInflightClean(boolean enableMetadataTable) throws Exception { init(); HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) @@ -798,7 +821,7 @@ public void testArchiveInflightClean() throws Exception { .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 3).build()) .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() 
.withRemoteServerPort(timelineServicePort).build()) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(enableMetadataTable).build()) .build(); metaClient = HoodieTableMetaClient.reload(metaClient); @@ -808,10 +831,16 @@ HoodieInstant notArchivedInstant2 = createCleanMetadata("13", false); HoodieInstant notArchivedInstant3 = createCleanMetadata("14", true); + if (enableMetadataTable) { + // Simulate a compaction commit in metadata table timeline + // so the archival in data table can happen + createCompactionCommitInMetadataTable(hadoopConf, wrapperFs, basePath, "14"); + } + HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, table); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(cfg, table); - archiveLog.archiveIfRequired(context); + archiver.archiveIfRequired(context); List<HoodieInstant> notArchivedInstants = metaClient.getActiveTimeline().reload().getInstants().collect(Collectors.toList()); assertEquals(3, notArchivedInstants.size(), "Not archived instants should be 3"); @@ -888,13 +917,42 @@ public void testArchiveTableWithMetadataTableCompaction() throws Exception { "00000009", "00000010", "00000011", "00000012")), getActiveCommitInstants(Arrays.asList("00000013", "00000014")), commitsAfterArchival); } + @Test + public void testArchiveCommitsWithCompactionCommitInMetadataTableTimeline() throws Exception { + HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, 2, 4, 20); + int startInstantTime = 100; + int numCommits = 15; + int numExpectedArchived = 6; // "100" till "105" should be archived in this case + + for (int i = startInstantTime; i < startInstantTime + numCommits; i++) { + HoodieTestDataGenerator.createCommitFile(basePath, Integer.toString(i), wrapperFs.getConf()); + } + // Simulate a compaction commit in metadata table timeline + // so the archival in data table can happen + createCompactionCommitInMetadataTable(hadoopConf, wrapperFs, basePath, "105"); + + HoodieTable table = HoodieSparkTable.create(writeConfig, context); + HoodieTimelineArchiver archiveLog = new HoodieTimelineArchiver(writeConfig, table); + + HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); + assertEquals(numCommits, timeline.countInstants(), String.format("Loaded %d commits and the count should match", numCommits)); + assertTrue(archiveLog.archiveIfRequired(context)); + timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(); + assertEquals(numCommits - numExpectedArchived, timeline.countInstants(), + "Since we have a compaction commit of 105 in metadata table timeline, we should never archive any commit after that"); + for (int i = startInstantTime + numExpectedArchived; i < startInstantTime + numCommits; i++) { + assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, Integer.toString(i))), + String.format("Commit %d should not be archived", i)); + } + } + private Pair<List<HoodieInstant>, List<HoodieInstant>> archiveAndGetCommitsList(HoodieWriteConfig writeConfig) throws IOException { metaClient.reloadActiveTimeline(); HoodieTimeline timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants(); List<HoodieInstant> originalCommits = timeline.getInstants().collect(Collectors.toList()); HoodieTable table = 
HoodieSparkTable.create(writeConfig, context, metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(writeConfig, table); - archiveLog.archiveIfRequired(context); + HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(writeConfig, table); + archiver.archiveIfRequired(context); timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants(); List commitsAfterArchival = timeline.getInstants().collect(Collectors.toList()); return Pair.of(originalCommits, commitsAfterArchival); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/row/TestHoodieRowCreateHandle.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/row/TestHoodieRowCreateHandle.java index 0f308425bc1c0..5a19f0afe9c65 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/row/TestHoodieRowCreateHandle.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/storage/row/TestHoodieRowCreateHandle.java @@ -25,6 +25,7 @@ import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieInsertException; +import org.apache.hudi.exception.TableNotFoundException; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.testutils.HoodieClientTestHarness; @@ -36,8 +37,9 @@ import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; -import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Random; @@ -170,19 +172,29 @@ public void testGlobalFailure() throws Exception { assertRows(inputRows, result, instantTime, fileNames); } - @Test - public void testInstantiationFailure() throws IOException { + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testInstantiationFailure(boolean enableMetadataTable) { // init config and table HoodieWriteConfig cfg = SparkDatasetTestUtils.getConfigBuilder(basePath, timelineServicePort) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()) - .withPath("/dummypath/abc/").build(); - HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); + .withPath("/dummypath/abc/") + .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(enableMetadataTable).build()) + .build(); try { + HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient); new HoodieRowCreateHandle(table, cfg, " def", UUID.randomUUID().toString(), "001", RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), SparkDatasetTestUtils.STRUCT_TYPE); fail("Should have thrown exception"); } catch (HoodieInsertException ioe) { - // expected + // expected without metadata table + if (enableMetadataTable) { + fail("Should have thrown TableNotFoundException"); + } + } catch (TableNotFoundException e) { + // expected with metadata table + if (!enableMetadataTable) { + fail("Should have thrown HoodieInsertException"); + } } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestCleaner.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestCleaner.java index 8a1f4abd29cea..f51a169dd9b44 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestCleaner.java +++ 
b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestCleaner.java @@ -114,6 +114,7 @@ import scala.Tuple3; +import static org.apache.hudi.HoodieTestCommitGenerator.getBaseFilename; import static org.apache.hudi.common.testutils.HoodieTestTable.makeIncrementalCommitTimes; import static org.apache.hudi.common.testutils.HoodieTestTable.makeNewCommitTime; import static org.apache.hudi.common.testutils.HoodieTestUtils.DEFAULT_PARTITION_PATHS; @@ -121,6 +122,7 @@ import static org.awaitility.Awaitility.await; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -253,6 +255,73 @@ public void testBulkInsertPreppedAndCleanByVersions() throws Exception { SparkRDDWriteClient::upsertPreppedRecords, true); } + + /** + * Tests no more than 1 clean is scheduled/executed if HoodieCompactionConfig.allowMultipleCleanSchedule config is disabled. + */ + @Test + public void testMultiClean() { + HoodieWriteConfig writeConfig = getConfigBuilder() + .withFileSystemViewConfig(new FileSystemViewStorageConfig.Builder() + .withEnableBackupForRemoteFileSystemView(false).build()) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024) + .withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(1) + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.EAGER) + .allowMultipleCleans(false) + .withAutoClean(false).retainCommits(1).retainFileVersions(1).build()) + .withEmbeddedTimelineServerEnabled(false).build(); + + int index = 0; + String cleanInstantTime; + final String partition = "2015/03/16"; + try (SparkRDDWriteClient client = new SparkRDDWriteClient(context, writeConfig)) { + // Three writes so we can initiate a clean + for (; index < 3; ++index) { + String newCommitTime = "00" + index; + List records = dataGen.generateInsertsForPartition(newCommitTime, 1, partition); + client.startCommitWithTime(newCommitTime); + client.insert(jsc.parallelize(records, 1), newCommitTime).collect(); + } + } + + // mimic failed/leftover clean by scheduling a clean but not performing it + cleanInstantTime = "00" + index++; + HoodieTable table = HoodieSparkTable.create(writeConfig, context); + Option cleanPlan = table.scheduleCleaning(context, cleanInstantTime, Option.empty()); + assertEquals(cleanPlan.get().getFilePathsToBeDeletedPerPartition().get(partition).size(), 1); + assertEquals(metaClient.reloadActiveTimeline().getCleanerTimeline().filterInflightsAndRequested().countInstants(), 1); + + try (SparkRDDWriteClient client = new SparkRDDWriteClient(context, writeConfig)) { + // Next commit. This is required so that there is an additional file version to clean. + String newCommitTime = "00" + index++; + List records = dataGen.generateInsertsForPartition(newCommitTime, 1, partition); + client.startCommitWithTime(newCommitTime); + client.insert(jsc.parallelize(records, 1), newCommitTime).collect(); + + // Initiate another clean. The previous leftover clean will be attempted first, followed by another clean + // due to the commit above. 
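// Summary of the sequencing this test pins down (assumed semantics of
// allowMultipleCleans(false), matching the assertions that follow; the instant times in
// this sketch are made up):
//   assertNull(client.clean("005"));      // a pending clean blocks scheduling a new one
//   table.clean(context, "003", false);   // complete the leftover clean first
//   assertNotNull(client.clean("006"));   // only now does a fresh clean go through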
+ String newCleanInstantTime = "00" + index++; + HoodieCleanMetadata cleanMetadata = client.clean(newCleanInstantTime); + // subsequent clean should not be triggered since allowMultipleCleanSchedules is set to false + assertNull(cleanMetadata); + + // let the old clean complete + table = HoodieSparkTable.create(writeConfig, context); + cleanMetadata = table.clean(context, cleanInstantTime, false); + assertNotNull(cleanMetadata); + + // any new clean should go ahead + cleanMetadata = client.clean(newCleanInstantTime); + // subsequent clean should not be triggered since allowMultipleCleanSchedules is set to false + assertNotNull(cleanMetadata); + + // 1 file cleaned + assertEquals(cleanMetadata.getPartitionMetadata().get(partition).getSuccessDeleteFiles().size(), 1); + assertEquals(cleanMetadata.getPartitionMetadata().get(partition).getFailedDeleteFiles().size(), 0); + assertEquals(cleanMetadata.getPartitionMetadata().get(partition).getDeletePathPatterns().size(), 1); + } + } + /** * Test Helper for Cleaning by versions logic from HoodieWriteClient API perspective. * @@ -272,7 +341,6 @@ private void testInsertAndCleanByVersions( .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(maxVersions).build()) .withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1).withDeleteParallelism(1) .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()) .build(); try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) { @@ -442,7 +510,6 @@ private void testInsertAndCleanByCommits( .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(maxCommits).build()) .withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1).withDeleteParallelism(1) .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()) .build(); SparkRDDWriteClient client = getHoodieWriteClient(cfg); @@ -519,7 +586,6 @@ private void testFailedInsertAndCleanByCommits( .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(maxCommits).build()) .withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1).withDeleteParallelism(1) .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()) .build(); SparkRDDWriteClient client = getHoodieWriteClient(cfg); @@ -571,7 +637,7 @@ private List runCleaner(HoodieWriteConfig config, int firstComm return runCleaner(config, false, firstCommitSequence); } - private List runCleaner(HoodieWriteConfig config, boolean simulateRetryFailure) throws IOException { + protected List runCleaner(HoodieWriteConfig config, boolean simulateRetryFailure) throws IOException { return runCleaner(config, simulateRetryFailure, 1); } @@ -648,7 +714,7 @@ private List runCleaner(HoodieWriteConfig config, boolean simul public void testKeepLatestFileVersions(Boolean enableBootstrapSourceClean) throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).enable(true).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()) 
.withCompactionConfig(HoodieCompactionConfig.newBuilder() .withCleanBootstrapBaseFileEnabled(enableBootstrapSourceClean) .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build()) @@ -811,7 +877,7 @@ public void testKeepLatestFileVersionsMOR() throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).enable(false).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()) .withCompactionConfig(HoodieCompactionConfig.newBuilder() .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(1).build()) .build(); @@ -848,8 +914,8 @@ public void testKeepLatestFileVersionsMOR() throws Exception { public void testKeepLatestCommitsMOR() throws Exception { HoodieWriteConfig config = - HoodieWriteConfig.newBuilder().withPath(basePath) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).enable(false).build()) + HoodieWriteConfig.newBuilder().withPath(basePath) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()) .withCompactionConfig(HoodieCompactionConfig.newBuilder() .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(1).build()) .build(); @@ -889,12 +955,15 @@ public void testKeepLatestCommitsMOR() throws Exception { @Test public void testCleanWithReplaceCommits() throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).enable(false).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .withMaxNumDeltaCommitsBeforeCompaction(1) + .withAssumeDatePartitioning(true).build()) .withCompactionConfig(HoodieCompactionConfig.newBuilder() .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()) .build(); - HoodieTestTable testTable = HoodieTestTable.of(metaClient); + HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context); + HoodieTestTable testTable = HoodieMetadataTestTable.of(metaClient, metadataWriter); String p0 = "2020/01/01"; String p1 = "2020/01/02"; @@ -903,7 +972,7 @@ public void testCleanWithReplaceCommits() throws Exception { String file1P1C0 = UUID.randomUUID().toString(); testTable.addInflightCommit("00000000000001").withBaseFilesInPartition(p0, file1P0C0).withBaseFilesInPartition(p1, file1P1C0); - HoodieCommitMetadata commitMetadata = generateCommitMetadata( + HoodieCommitMetadata commitMetadata = generateCommitMetadata("00000000000001", Collections.unmodifiableMap(new HashMap>() { { put(p0, CollectionUtils.createImmutableList(file1P0C0)); @@ -911,6 +980,7 @@ public void testCleanWithReplaceCommits() throws Exception { } }) ); + metadataWriter.update(commitMetadata, "00000000000001", false); metaClient.getActiveTimeline().saveAsComplete( new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "00000000000001"), Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); @@ -926,7 +996,8 @@ public void testCleanWithReplaceCommits() throws Exception { // notice that clustering generates empty inflight commit files Map partitionAndFileId002 = testTable.forReplaceCommit("00000000000002").getFileIdsWithBaseFilesInPartitions(p0); String file2P0C1 = partitionAndFileId002.get(p0); - Pair replaceMetadata = 
generateReplaceCommitMetadata(p0, file1P0C0, file2P0C1); + Pair replaceMetadata = + generateReplaceCommitMetadata("00000000000002", p0, file1P0C0, file2P0C1); testTable.addReplaceCommit("00000000000002", Option.of(replaceMetadata.getKey()), Option.empty(), replaceMetadata.getValue()); // run cleaner @@ -940,7 +1011,7 @@ public void testCleanWithReplaceCommits() throws Exception { // notice that clustering generates empty inflight commit files Map partitionAndFileId003 = testTable.forReplaceCommit("00000000000003").getFileIdsWithBaseFilesInPartitions(p1); String file3P1C2 = partitionAndFileId003.get(p1); - replaceMetadata = generateReplaceCommitMetadata(p1, file1P1C0, file3P1C2); + replaceMetadata = generateReplaceCommitMetadata("00000000000003", p1, file1P1C0, file3P1C2); testTable.addReplaceCommit("00000000000003", Option.of(replaceMetadata.getKey()), Option.empty(), replaceMetadata.getValue()); // run cleaner @@ -955,11 +1026,11 @@ public void testCleanWithReplaceCommits() throws Exception { // notice that clustering generates empty inflight commit files Map partitionAndFileId004 = testTable.forReplaceCommit("00000000000004").getFileIdsWithBaseFilesInPartitions(p0); String file4P0C3 = partitionAndFileId004.get(p0); - replaceMetadata = generateReplaceCommitMetadata(p0, file2P0C1, file4P0C3); + replaceMetadata = generateReplaceCommitMetadata("00000000000004", p0, file2P0C1, file4P0C3); testTable.addReplaceCommit("00000000000004", Option.of(replaceMetadata.getKey()), Option.empty(), replaceMetadata.getValue()); // run cleaner - List hoodieCleanStatsFour = runCleaner(config); + List hoodieCleanStatsFour = runCleaner(config, 5); assertTrue(testTable.baseFileExists(p0, "00000000000004", file4P0C3)); assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1)); assertTrue(testTable.baseFileExists(p1, "00000000000003", file3P1C2)); @@ -969,12 +1040,12 @@ public void testCleanWithReplaceCommits() throws Exception { // make next replacecommit, with 1 clustering operation. Replace all data in p1. 
no new files created // notice that clustering generates empty inflight commit files - Map partitionAndFileId005 = testTable.forReplaceCommit("00000000000005").getFileIdsWithBaseFilesInPartitions(p1); + Map partitionAndFileId005 = testTable.forReplaceCommit("00000000000006").getFileIdsWithBaseFilesInPartitions(p1); String file4P1C4 = partitionAndFileId005.get(p1); - replaceMetadata = generateReplaceCommitMetadata(p0, file3P1C2, file4P1C4); - testTable.addReplaceCommit("00000000000005", Option.of(replaceMetadata.getKey()), Option.empty(), replaceMetadata.getValue()); + replaceMetadata = generateReplaceCommitMetadata("00000000000006", p0, file3P1C2, file4P1C4); + testTable.addReplaceCommit("00000000000006", Option.of(replaceMetadata.getKey()), Option.empty(), replaceMetadata.getValue()); - List hoodieCleanStatsFive = runCleaner(config, 2); + List hoodieCleanStatsFive = runCleaner(config, 7); assertTrue(testTable.baseFileExists(p0, "00000000000004", file4P0C3)); assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1)); assertTrue(testTable.baseFileExists(p1, "00000000000003", file3P1C2)); @@ -982,9 +1053,8 @@ public void testCleanWithReplaceCommits() throws Exception { assertFalse(testTable.baseFileExists(p1, "00000000000001", file1P1C0)); } - private Pair generateReplaceCommitMetadata(String partition, - String replacedFileId, - String newFileId) { + private Pair generateReplaceCommitMetadata( + String instantTime, String partition, String replacedFileId, String newFileId) { HoodieRequestedReplaceMetadata requestedReplaceMetadata = new HoodieRequestedReplaceMetadata(); requestedReplaceMetadata.setOperationType(WriteOperationType.CLUSTER.toString()); requestedReplaceMetadata.setVersion(1); @@ -1005,7 +1075,7 @@ private Pair genera if (!StringUtils.isNullOrEmpty(newFileId)) { HoodieWriteStat writeStat = new HoodieWriteStat(); writeStat.setPartitionPath(partition); - writeStat.setPath(newFileId); + writeStat.setPath(partition + "/" + getBaseFilename(instantTime, newFileId)); writeStat.setFileId(newFileId); replaceMetadata.addWriteStat(partition, writeStat); } @@ -1180,7 +1250,7 @@ private static void assertCleanMetadataPathEquals(Map expected, } } - private static Stream argumentsForTestKeepLatestCommits() { + protected static Stream argumentsForTestKeepLatestCommits() { return Stream.of( Arguments.of(false, false, false), Arguments.of(true, false, false), @@ -1196,7 +1266,7 @@ private static Stream argumentsForTestKeepLatestCommits() { @MethodSource("argumentsForTestKeepLatestCommits") public void testKeepLatestCommits(boolean simulateFailureRetry, boolean enableIncrementalClean, boolean enableBootstrapSourceClean) throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).enable(false).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()) .withCompactionConfig(HoodieCompactionConfig.newBuilder() .withIncrementalCleaningMode(enableIncrementalClean) .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.EAGER) @@ -1216,7 +1286,7 @@ public void testKeepLatestCommits(boolean simulateFailureRetry, boolean enableIn : UUID.randomUUID().toString(); testTable.addInflightCommit("00000000000001").withBaseFilesInPartition(p0, file1P0C0).withBaseFilesInPartition(p1, file1P1C0); - HoodieCommitMetadata commitMetadata = generateCommitMetadata( + HoodieCommitMetadata commitMetadata = 
generateCommitMetadata("00000000000001", Collections.unmodifiableMap(new HashMap>() { { put(p0, CollectionUtils.createImmutableList(file1P0C0)); @@ -1240,7 +1310,7 @@ public void testKeepLatestCommits(boolean simulateFailureRetry, boolean enableIn String file2P0C1 = partitionAndFileId002.get(p0); String file2P1C1 = partitionAndFileId002.get(p1); testTable.forCommit("00000000000002").withBaseFilesInPartition(p0, file1P0C0).withBaseFilesInPartition(p1, file1P1C0); - commitMetadata = generateCommitMetadata(new HashMap>() { + commitMetadata = generateCommitMetadata("00000000000002", new HashMap>() { { put(p0, CollectionUtils.createImmutableList(file1P0C0, file2P0C1)); put(p1, CollectionUtils.createImmutableList(file1P1C0, file2P1C1)); @@ -1261,9 +1331,9 @@ public void testKeepLatestCommits(boolean simulateFailureRetry, boolean enableIn .withBaseFilesInPartition(p0, file1P0C0) .withBaseFilesInPartition(p0, file2P0C1) .getFileIdsWithBaseFilesInPartitions(p0).get(p0); - commitMetadata = generateCommitMetadata(CollectionUtils - .createImmutableMap(p0, - CollectionUtils.createImmutableList(file1P0C0, file2P0C1, file3P0C2))); + commitMetadata = generateCommitMetadata("00000000000003", + CollectionUtils.createImmutableMap( + p0, CollectionUtils.createImmutableList(file1P0C0, file2P0C1, file3P0C2))); metaClient.getActiveTimeline().saveAsComplete( new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "00000000000003"), Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); @@ -1278,8 +1348,9 @@ public void testKeepLatestCommits(boolean simulateFailureRetry, boolean enableIn .withBaseFilesInPartition(p0, file1P0C0) .withBaseFilesInPartition(p0, file2P0C1) .getFileIdsWithBaseFilesInPartitions(p0).get(p0); - commitMetadata = generateCommitMetadata(CollectionUtils.createImmutableMap( - p0, CollectionUtils.createImmutableList(file1P0C0, file2P0C1, file4P0C3))); + commitMetadata = generateCommitMetadata("00000000000004", + CollectionUtils.createImmutableMap( + p0, CollectionUtils.createImmutableList(file1P0C0, file2P0C1, file4P0C3))); metaClient.getActiveTimeline().saveAsComplete( new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "00000000000004"), Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); @@ -1305,8 +1376,8 @@ public void testKeepLatestCommits(boolean simulateFailureRetry, boolean enableIn // No cleaning on partially written file, with no commit. testTable.forCommit("00000000000005").withBaseFilesInPartition(p0, file3P0C2); - commitMetadata = generateCommitMetadata(CollectionUtils.createImmutableMap(p0, - CollectionUtils.createImmutableList(file3P0C2))); + commitMetadata = generateCommitMetadata("00000000000005", + CollectionUtils.createImmutableMap(p0, CollectionUtils.createImmutableList(file3P0C2))); metaClient.getActiveTimeline().createNewInstant( new HoodieInstant(State.REQUESTED, HoodieTimeline.COMMIT_ACTION, "00000000000005")); metaClient.getActiveTimeline().transitionRequestedToInflight( @@ -1325,7 +1396,7 @@ public void testKeepLatestCommits(boolean simulateFailureRetry, boolean enableIn * @return Partition to BootstrapFileMapping Map * @throws IOException */ - private Map> generateBootstrapIndexAndSourceData(String... partitions) throws IOException { + protected Map> generateBootstrapIndexAndSourceData(String... 
partitions) throws IOException { // create bootstrap source data path java.nio.file.Path sourcePath = tempDir.resolve("data"); java.nio.file.Files.createDirectories(sourcePath); @@ -1378,7 +1449,7 @@ public void testCleanMarkerDataFilesOnRollback() throws Exception { @Test public void testCleaningWithZeroPartitionPaths() throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).enable(true).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()) .withCompactionConfig(HoodieCompactionConfig.newBuilder() .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()) .build(); @@ -1402,7 +1473,7 @@ public void testCleaningWithZeroPartitionPaths() throws Exception { @Test public void testKeepLatestCommitsWithPendingCompactions() throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).enable(true).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()) .withCompactionConfig(HoodieCompactionConfig.newBuilder() .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(2).build()) .build(); @@ -1426,7 +1497,7 @@ public void testKeepLatestCommitsWithPendingCompactions() throws Exception { public void testKeepLatestVersionsWithPendingCompactions(boolean retryFailure) throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).enable(true).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()) .withCompactionConfig(HoodieCompactionConfig.newBuilder() .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(2).build()) .build(); @@ -1677,14 +1748,15 @@ private Stream> convertPathToFileIdWithCommitTime(final Hoo return Stream.concat(stream1, stream2); } - private static HoodieCommitMetadata generateCommitMetadata(Map> partitionToFilePaths) { + protected static HoodieCommitMetadata generateCommitMetadata( + String instantTime, Map> partitionToFilePaths) { HoodieCommitMetadata metadata = new HoodieCommitMetadata(); - partitionToFilePaths.forEach((key, value) -> value.forEach(f -> { + partitionToFilePaths.forEach((partitionPath, fileList) -> fileList.forEach(f -> { HoodieWriteStat writeStat = new HoodieWriteStat(); - writeStat.setPartitionPath(key); - writeStat.setPath(f); + writeStat.setPartitionPath(partitionPath); + writeStat.setPath(partitionPath + "/" + getBaseFilename(instantTime, f)); writeStat.setFileId(f); - metadata.addWriteStat(key, writeStat); + metadata.addWriteStat(partitionPath, writeStat); })); return metadata; } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestConsistencyGuard.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestConsistencyGuard.java index afbe94937949f..22fafe4a58747 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestConsistencyGuard.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestConsistencyGuard.java @@ -169,9 +169,9 @@ private ConsistencyGuardConfig getConsistencyGuardConfig() { return getConsistencyGuardConfig(3, 10, 10); } - private 
ConsistencyGuardConfig getConsistencyGuardConfig(int maxChecks, int initalSleep, int maxSleep) { + private ConsistencyGuardConfig getConsistencyGuardConfig(int maxChecks, int initialSleep, int maxSleep) { return ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true) - .withInitialConsistencyCheckIntervalMs(initalSleep).withMaxConsistencyCheckIntervalMs(maxSleep) + .withInitialConsistencyCheckIntervalMs(initialSleep).withMaxConsistencyCheckIntervalMs(maxSleep) .withMaxConsistencyChecks(maxChecks).build(); } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java index 8b8df197ba78b..dcc41addc8f31 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java @@ -21,7 +21,6 @@ import org.apache.hudi.client.HoodieReadClient; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieCommitMetadata; @@ -44,7 +43,8 @@ import org.apache.hudi.index.HoodieIndex.IndexType; import org.apache.hudi.metadata.HoodieTableMetadataWriter; import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; -import org.apache.hudi.table.action.deltacommit.AbstractSparkDeltaCommitActionExecutor; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.action.deltacommit.BaseSparkDeltaCommitActionExecutor; import org.apache.hudi.table.action.deltacommit.SparkDeleteDeltaCommitActionExecutor; import org.apache.hudi.testutils.HoodieClientTestUtils; import org.apache.hudi.testutils.HoodieMergeOnReadTestUtils; @@ -54,10 +54,12 @@ import org.apache.avro.generic.GenericRecord; import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapred.JobConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; +import org.apache.spark.storage.StorageLevel; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; @@ -189,8 +191,10 @@ public void testUpsertPartitioner(boolean populateMetaFields) throws Exception { assertTrue(fileIdToNewSize.entrySet().stream().anyMatch(entry -> fileIdToSize.get(entry.getKey()) < entry.getValue())); - List dataFiles = roView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList()); - List recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), dataFiles, + List inputPaths = roView.getLatestBaseFiles() + .map(baseFile -> new Path(baseFile.getPath()).getParent().toString()) + .collect(Collectors.toList()); + List recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath(), new JobConf(hadoopConf()), true, false); // Wrote 20 records in 2 batches assertEquals(40, recordsRead.size(), "Must contain 40 records"); @@ -204,8 +208,7 @@ public void testLogFileCountsAfterCompaction(boolean preserveCommitMeta) throws boolean populateMetaFields = true; // insert 100 records HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(true, false, 
HoodieIndex.IndexType.BLOOM, - 1024 * 1024 * 1024L, HoodieClusteringConfig.newBuilder().build(), preserveCommitMeta) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()); + 1024 * 1024 * 1024L, HoodieClusteringConfig.newBuilder().build(), preserveCommitMeta); addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); HoodieWriteConfig config = cfgBuilder.build(); @@ -255,7 +258,7 @@ public void testLogFileCountsAfterCompaction(boolean preserveCommitMeta) throws // Do a compaction String compactionInstantTime = writeClient.scheduleCompaction(Option.empty()).get().toString(); - JavaRDD result = (JavaRDD) writeClient.compact(compactionInstantTime); + HoodieWriteMetadata<JavaRDD<WriteStatus>> result = writeClient.compact(compactionInstantTime); // Verify that recently written compacted data file has no log file metaClient = HoodieTableMetaClient.reload(metaClient); @@ -272,8 +275,7 @@ public void testLogFileCountsAfterCompaction(boolean preserveCommitMeta) throws for (FileSlice slice : groupedLogFiles) { assertEquals(0, slice.getLogFiles().count(), "After compaction there should be no log files visible on a full view"); } - List writeStatuses = result.collect(); - assertTrue(writeStatuses.stream().anyMatch(writeStatus -> writeStatus.getStat().getPartitionPath().contentEquals(partitionPath))); + assertTrue(result.getCommitMetadata().get().getWritePartitionPaths().stream().anyMatch(part -> part.contentEquals(partitionPath))); } // Check the entire dataset has all records still @@ -439,8 +441,9 @@ public void testRollingStatsWithSmallFileHandling() throws Exception { // Test small file handling after compaction instantTime = "002"; client.scheduleCompactionAtInstant(instantTime, Option.of(metadata.getExtraMetadata())); - statuses = (JavaRDD) client.compact(instantTime); - client.commitCompaction(instantTime, statuses, Option.empty()); + HoodieWriteMetadata<JavaRDD<WriteStatus>> compactionMetadata = client.compact(instantTime); + statuses = compactionMetadata.getWriteStatuses(); + client.commitCompaction(instantTime, compactionMetadata.getCommitMetadata().get(), Option.empty()); // Read from commit file table = HoodieSparkTable.create(cfg, context()); @@ -552,7 +555,7 @@ public void testHandleUpdateWithMultiplePartitions() throws Exception { // initialize partitioner hoodieTable.getHoodieView().sync(); - AbstractSparkDeltaCommitActionExecutor actionExecutor = new SparkDeleteDeltaCommitActionExecutor(context(), cfg, hoodieTable, + BaseSparkDeltaCommitActionExecutor actionExecutor = new SparkDeleteDeltaCommitActionExecutor(context(), cfg, hoodieTable, newDeleteTime, deleteRDD); actionExecutor.getUpsertPartitioner(new WorkloadProfile(buildProfile(deleteRDD))); final List<List<WriteStatus>> deleteStatus = jsc().parallelize(Arrays.asList(1)).map(x -> { @@ -564,9 +567,52 @@ public void testHandleUpdateWithMultiplePartitions() throws Exception { WriteStatus status = deleteStatus.get(0).get(0); assertTrue(status.hasErrors()); long numRecordsInPartition = fewRecordsForDelete.stream().filter(u -> - u.getPartitionPath().equals(partitionPath)).count(); + u.getPartitionPath().equals(partitionPath)).count(); assertEquals(fewRecordsForDelete.size() - numRecordsInPartition, status.getTotalErrorRecords()); } } + + @Test + public void testReleaseResource() throws Exception { + HoodieWriteConfig.Builder builder = getConfigBuilder(true); + builder.withReleaseResourceEnabled(true); + builder.withAutoCommit(false); + /** + * Write 1 (test when RELEASE_RESOURCE_ENABLE is true) + */ + try (SparkRDDWriteClient client = 
getHoodieWriteClient(builder.build())) { + + String newCommitTime = "001"; + client.startCommitWithTime(newCommitTime); + + List records = dataGen.generateInserts(newCommitTime, 20); + JavaRDD writeRecords = jsc().parallelize(records, 1); + writeRecords.persist(StorageLevel.MEMORY_AND_DISK()); + List statuses = client.upsert(writeRecords, newCommitTime).collect(); + assertNoWriteErrors(statuses); + client.commitStats(newCommitTime, statuses.stream().map(WriteStatus::getStat).collect(Collectors.toList()), Option.empty(), metaClient.getCommitActionType()); + assertEquals(spark().sparkContext().persistentRdds().size(), 0); + } + + builder.withReleaseResourceEnabled(false); + + /** + * Write 2 (test when RELEASE_RESOURCE_ENABLE is false) + */ + try (SparkRDDWriteClient client = getHoodieWriteClient(builder.build())) { + String newCommitTime = "002"; + client.startCommitWithTime(newCommitTime); + + List records = dataGen.generateInserts(newCommitTime, 20); + JavaRDD writeRecords = jsc().parallelize(records, 1); + + writeRecords.persist(StorageLevel.MEMORY_AND_DISK()); + List statuses = client.upsert(writeRecords, newCommitTime).collect(); + assertNoWriteErrors(statuses); + client.commitStats(newCommitTime, statuses.stream().map(WriteStatus::getStat).collect(Collectors.toList()), Option.empty(), metaClient.getCommitActionType()); + assertTrue(spark().sparkContext().persistentRdds().size() > 0); + } + + } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestCopyOnWriteActionExecutor.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestCopyOnWriteActionExecutor.java index cba77b0c7e55a..53cd6e5d1e749 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestCopyOnWriteActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestCopyOnWriteActionExecutor.java @@ -22,6 +22,7 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.bloom.BloomFilter; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; @@ -177,11 +178,11 @@ public void testUpdateRecords(HoodieIndex.IndexType indexType) throws Exception List records = new ArrayList<>(); RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); - records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1)); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1)); RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); - records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2)); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2)); RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); - records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3)); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3)); // Insert new records final HoodieSparkCopyOnWriteTable cowTable = table; @@ -210,12 +211,12 @@ public void testUpdateRecords(HoodieIndex.IndexType indexType) throws Exception String updateRecordStr1 = 
"{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; RawTripTestPayload updateRowChanges1 = new RawTripTestPayload(updateRecordStr1); - HoodieRecord updatedRecord1 = new HoodieRecord( + HoodieRecord updatedRecord1 = new HoodieAvroRecord( new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()), updateRowChanges1); RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4); HoodieRecord insertedRecord1 = - new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); List updatedRecords = Arrays.asList(updatedRecord1, insertedRecord1); @@ -290,7 +291,7 @@ private List newHoodieRecords(int n, String time) throws Exception String recordStr = String.format("{\"_row_key\":\"%s\",\"time\":\"%s\",\"number\":%d}", UUID.randomUUID().toString(), time, i); RawTripTestPayload rowChange = new RawTripTestPayload(recordStr); - records.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange)); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange)); } return records; } @@ -316,11 +317,11 @@ public void testMetadataAggregateFromWriteStatus() throws Exception { List records = new ArrayList<>(); RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1); - records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1)); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1)); RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2); - records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2)); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2)); RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3); - records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3)); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3)); // Insert new records BaseSparkCommitActionExecutor actionExecutor = new SparkInsertCommitActionExecutor(context, config, table, @@ -416,7 +417,7 @@ public void testFileSizeUpsertRecords() throws Exception { String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString() + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}"; RawTripTestPayload rowChange = new RawTripTestPayload(recordStr); - records.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange)); + records.add(new HoodieAvroRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange)); } // Insert new records diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestDeleteHelper.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestDeleteHelper.java index 8617c848729c2..2d852f8107ef0 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestDeleteHelper.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestDeleteHelper.java @@ -24,7 +24,7 @@ import org.apache.hudi.common.model.HoodieRecord; 
import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.data.HoodieJavaRDD; -import org.apache.hudi.index.bloom.HoodieBloomIndex; +import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; @@ -66,7 +66,7 @@ private enum CombineTestMode { private static final int DELETE_PARALLELISM = 200; @Mock - private HoodieBloomIndex index; + private HoodieIndex index; @Mock private HoodieTable, JavaRDD, JavaRDD> table; @Mock diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestUpsertPartitioner.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestUpsertPartitioner.java index 1e5f8029a7145..3039eb3bd9b5f 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestUpsertPartitioner.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestUpsertPartitioner.java @@ -21,7 +21,6 @@ import org.apache.hudi.avro.model.HoodieClusteringPlan; import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata; -import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; @@ -219,7 +218,7 @@ public void testPartitionWeight() throws Exception { final String testPartitionPath = "2016/09/26"; int totalInsertNum = 2000; - HoodieWriteConfig config = makeHoodieClientConfigBuilder().withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()) + HoodieWriteConfig config = makeHoodieClientConfigBuilder() .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(0) .insertSplitSize(totalInsertNum / 2).autoTuneInsertSplits(false).build()).build(); @@ -374,23 +373,23 @@ public void testUpsertPartitionerWithSmallFileHandlingAndClusteringPlan() throws .setClusteringPlan(clusteringPlan).setOperationType(WriteOperationType.CLUSTER.name()).build(); FileCreateUtils.createRequestedReplaceCommit(basePath,"002", Option.of(requestedReplaceMetadata)); - // create file slice 002 - FileCreateUtils.createBaseFile(basePath, testPartitionPath, "002", "2", 1); - FileCreateUtils.createCommit(basePath, "002"); + // create file slice 003 + FileCreateUtils.createBaseFile(basePath, testPartitionPath, "003", "3", 1); + FileCreateUtils.createCommit(basePath, "003"); metaClient = HoodieTableMetaClient.reload(metaClient); // generate new data to be ingested HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] {testPartitionPath}); - List insertRecords = dataGenerator.generateInserts("003", 100); + List insertRecords = dataGenerator.generateInserts("004", 100); WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(insertRecords))); HoodieSparkTable table = HoodieSparkTable.create(config, context, metaClient); // create UpsertPartitioner UpsertPartitioner partitioner = new UpsertPartitioner(profile, context, table, config); - // for now we have file slice1 and file slice2 and file slice1 is contained in pending clustering plan - // So that only file slice2 can be used for ingestion. + // for now we have file slice1 and file slice3 and file slice1 is contained in pending clustering plan + // So that only file slice3 can be used for ingestion. 
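As context for the assertion that follows: the partitioner treats a base file as a small-file candidate only if its file group is not referenced by a pending clustering plan, which is why file slice1 (pending clustering) is skipped and only file slice3 counts. A minimal illustrative sketch of that filtering idea; the class, method, and variable names here are hypothetical stand-ins, not the actual UpsertPartitioner internals:

import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

class SmallFileFilterSketch {
  // Keep only files below the small-file threshold whose file group is not
  // part of a pending clustering plan; only those stay eligible for ingestion.
  static List<String> smallFileCandidates(Map<String, Long> fileIdToSize,
                                          Set<String> pendingClusteringFileIds,
                                          long smallFileLimitBytes) {
    return fileIdToSize.entrySet().stream()
        .filter(e -> e.getValue() < smallFileLimitBytes)
        .filter(e -> !pendingClusteringFileIds.contains(e.getKey()))
        .map(Map.Entry::getKey)
        .collect(Collectors.toList());
  }
}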
assertEquals(1, partitioner.smallFiles.size(), "Should have 1 small file to be ingested."); } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestAsyncCompaction.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestAsyncCompaction.java index c2879fb1aaf4c..87d8613303347 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestAsyncCompaction.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestAsyncCompaction.java @@ -18,11 +18,8 @@ package org.apache.hudi.table.action.compact; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.Path; import org.apache.hudi.client.HoodieReadClient; import org.apache.hudi.client.SparkRDDWriteClient; -import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.model.HoodieFileGroupId; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.table.HoodieTableMetaClient; @@ -32,6 +29,9 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; import org.apache.spark.api.java.JavaRDD; import org.junit.jupiter.api.Test; @@ -52,7 +52,6 @@ public class TestAsyncCompaction extends CompactionTestBase { private HoodieWriteConfig getConfig(Boolean autoCommit) { return getConfigBuilder(autoCommit) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()) .build(); } @@ -204,8 +203,8 @@ public void testScheduleIngestionBeforePendingCompaction() throws Exception { String compactionInstantTime = "006"; int numRecs = 2000; - final List initalRecords = dataGen.generateInserts(firstInstantTime, numRecs); - final List records = runNextDeltaCommits(client, readClient, Arrays.asList(firstInstantTime, secondInstantTime), initalRecords, cfg, true, + final List initialRecords = dataGen.generateInserts(firstInstantTime, numRecs); + final List records = runNextDeltaCommits(client, readClient, Arrays.asList(firstInstantTime, secondInstantTime), initialRecords, cfg, true, new ArrayList<>()); // Schedule compaction but do not run them diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestHoodieCompactor.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestHoodieCompactor.java index 454c289dbd6d8..9afe5f3533cac 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestHoodieCompactor.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestHoodieCompactor.java @@ -21,7 +21,6 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieRecord; @@ -159,7 +158,6 @@ public void testWriteStatusContentsAfterCompaction() throws Exception { // insert 100 records HoodieWriteConfig config = getConfigBuilder() .withCompactionConfig(HoodieCompactionConfig.newBuilder().withMaxNumDeltaCommitsBeforeCompaction(1).build()) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()) .build(); try (SparkRDDWriteClient writeClient = 
getHoodieWriteClient(config)) { String newCommitTime = "100"; @@ -175,7 +173,7 @@ public void testWriteStatusContentsAfterCompaction() throws Exception { List updatedRecords = dataGen.generateUpdates(newCommitTime, records); JavaRDD updatedRecordsRDD = jsc.parallelize(updatedRecords, 1); - HoodieIndex index = new HoodieBloomIndex<>(config, SparkHoodieBloomIndexHelper.getInstance()); + HoodieIndex index = new HoodieBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance()); JavaRDD updatedTaggedRecordsRDD = tagLocation(index, updatedRecordsRDD, table); writeClient.startCommitWithTime(newCommitTime); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestInlineCompaction.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestInlineCompaction.java index ef52953a2f0c8..310ff4fe8aede 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestInlineCompaction.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestInlineCompaction.java @@ -28,6 +28,7 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.marker.WriteMarkersFactory; + import org.junit.jupiter.api.Test; import java.util.ArrayList; @@ -62,7 +63,7 @@ public void testCompactionIsNotScheduledEarly() throws Exception { runNextDeltaCommits(writeClient, readClient, instants, records, cfg, true, new ArrayList<>()); HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); - // Then: ensure no compaction is executedm since there are only 2 delta commits + // Then: ensure no compaction is executed since there are only 2 delta commits assertEquals(2, metaClient.getActiveTimeline().getWriteTimeline().countInstants()); } } @@ -152,7 +153,7 @@ public void testSuccessfulCompactionBasedOnNumAndTime() throws Exception { runNextDeltaCommits(writeClient, readClient, instants, records, cfg, true, new ArrayList<>()); HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); - // Then: ensure no compaction is executedm since there are only 3 delta commits + // Then: ensure no compaction is executed since there are only 3 delta commits assertEquals(3, metaClient.getActiveTimeline().getWriteTimeline().countInstants()); // 4th commit, that will trigger compaction metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build(); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/strategy/TestHoodieCompactionStrategy.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/strategy/TestHoodieCompactionStrategy.java index dee1fadd73d5f..0c7190092e730 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/strategy/TestHoodieCompactionStrategy.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/strategy/TestHoodieCompactionStrategy.java @@ -143,10 +143,10 @@ public void testDayBasedCompactionSimple() { "DayBasedCompactionStrategy should have resulted in fewer compactions"); assertEquals(2, returned.size(), "DayBasedCompactionStrategy should have resulted in fewer compactions"); - int comparision = strategy.getComparator().compare(returned.get(returned.size() - 1).getPartitionPath(), + int comparison = 
strategy.getComparator().compare(returned.get(returned.size() - 1).getPartitionPath(), returned.get(0).getPartitionPath()); // Either the partition paths are sorted in descending order or they are equal - assertTrue(comparision >= 0, "DayBasedCompactionStrategy should sort partitions in descending order"); + assertTrue(comparison >= 0, "DayBasedCompactionStrategy should sort partitions in descending order"); } @Test @@ -192,10 +192,10 @@ public void testBoundedPartitionAwareCompactionSimple() { assertEquals(5, returned.size(), "BoundedPartitionAwareCompactionStrategy should have resulted in fewer compactions"); - int comparision = strategy.getComparator().compare(returned.get(returned.size() - 1).getPartitionPath(), + int comparison = strategy.getComparator().compare(returned.get(returned.size() - 1).getPartitionPath(), returned.get(0).getPartitionPath()); // Either the partition paths are sorted in descending order or they are equal - assertTrue(comparision >= 0, "BoundedPartitionAwareCompactionStrategy should sort partitions in descending order"); + assertTrue(comparison >= 0, "BoundedPartitionAwareCompactionStrategy should sort partitions in descending order"); } @Test diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/HoodieClientRollbackTestBase.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/HoodieClientRollbackTestBase.java index 3b0829b1655cb..33a1c58a3a991 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/HoodieClientRollbackTestBase.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/HoodieClientRollbackTestBase.java @@ -33,6 +33,7 @@ import org.apache.hudi.table.HoodieTable; import org.apache.hudi.testutils.Assertions; import org.apache.hudi.testutils.HoodieClientTestBase; + import org.apache.spark.api.java.JavaRDD; import java.io.IOException; @@ -52,7 +53,7 @@ protected void twoUpsertCommitDataWithTwoPartitions(List firstPartiti //just generate two partitions dataGen = new HoodieTestDataGenerator(new String[]{DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}); //1. prepare data - HoodieTestDataGenerator.writePartitionMetadata(fs, new String[]{DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}, basePath); + HoodieTestDataGenerator.writePartitionMetadataDeprecated(fs, new String[]{DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}, basePath); SparkRDDWriteClient client = getHoodieWriteClient(cfg); /** * Write 1 (only inserts) @@ -78,18 +79,18 @@ protected void twoUpsertCommitDataWithTwoPartitions(List firstPartiti } - //2. assert filegroup and get the first partition fileslice + //2. assert file group and get the first partition file slice HoodieTable table = this.getHoodieTable(metaClient, cfg); SyncableFileSystemView fsView = getFileSystemViewWithUnCommittedSlices(table.getMetaClient()); List firstPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_FIRST_PARTITION_PATH).collect(Collectors.toList()); assertEquals(1, firstPartitionCommit2FileGroups.size()); firstPartitionCommit2FileSlices.addAll(firstPartitionCommit2FileGroups.get(0).getAllFileSlices().collect(Collectors.toList())); - //3. assert filegroup and get the second partition fileslice + //3. 
assert file group and get the second partition file slice List secondPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_SECOND_PARTITION_PATH).collect(Collectors.toList()); assertEquals(1, secondPartitionCommit2FileGroups.size()); secondPartitionCommit2FileSlices.addAll(secondPartitionCommit2FileGroups.get(0).getAllFileSlices().collect(Collectors.toList())); - //4. assert fileslice + //4. assert file slice HoodieTableType tableType = this.getTableType(); if (tableType.equals(HoodieTableType.COPY_ON_WRITE)) { assertEquals(2, firstPartitionCommit2FileSlices.size()); @@ -106,7 +107,7 @@ protected void insertOverwriteCommitDataWithTwoPartitions(List firstP boolean commitSecondInsertOverwrite) throws IOException { //just generate two partitions dataGen = new HoodieTestDataGenerator(new String[]{DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}); - HoodieTestDataGenerator.writePartitionMetadata(fs, new String[]{DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}, basePath); + HoodieTestDataGenerator.writePartitionMetadataDeprecated(fs, new String[]{DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}, basePath); SparkRDDWriteClient client = getHoodieWriteClient(cfg); /** * Write 1 (upsert) diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestMergeOnReadRollbackActionExecutor.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestMergeOnReadRollbackActionExecutor.java index 4e98b220f3613..c9e3fed871acf 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestMergeOnReadRollbackActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/rollback/TestMergeOnReadRollbackActionExecutor.java @@ -20,18 +20,33 @@ import org.apache.hudi.avro.model.HoodieRollbackPartitionMetadata; import org.apache.hudi.client.SparkRDDWriteClient; -import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.fs.ConsistencyGuardConfig; import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieFileGroup; import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.table.view.FileSystemViewStorageType; +import org.apache.hudi.common.table.view.SyncableFileSystemView; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.marker.WriteMarkersFactory; +import org.apache.hudi.testutils.MetadataMergeWriteStatus; +import org.apache.spark.api.java.JavaRDD; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Assertions; import 
org.junit.jupiter.api.BeforeEach; @@ -41,9 +56,11 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Map; import java.util.stream.Collectors; +import java.util.stream.Stream; import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH; import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH; @@ -112,7 +129,7 @@ public void testMergeOnReadRollbackActionExecutor(boolean isUsingMarkers) throws assertTrue(meta.getSuccessDeleteFiles() == null || meta.getSuccessDeleteFiles().size() == 0); } - //4. assert filegroup after rollback, and compare to the rollbackstat + //4. assert file group after rollback, and compare to the rollbackstat // assert the first partition data and log file size List firstPartitionRollBack1FileGroups = table.getFileSystemView().getAllFileGroups(DEFAULT_FIRST_PARTITION_PATH).collect(Collectors.toList()); assertEquals(1, firstPartitionRollBack1FileGroups.size()); @@ -140,6 +157,131 @@ public void testMergeOnReadRollbackActionExecutor(boolean isUsingMarkers) throws assertFalse(WriteMarkersFactory.get(cfg.getMarkersType(), table, "002").doesMarkerDirExist()); } + @Test + public void testRollbackForCanIndexLogFile() throws IOException { + cleanupResources(); + setUpDFS(); + //1. prepare data and assert data result + //just generate one partitions + dataGen = new HoodieTestDataGenerator(new String[]{DEFAULT_FIRST_PARTITION_PATH}); + HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) + .withParallelism(2, 2).withBulkInsertParallelism(2).withFinalizeWriteParallelism(2).withDeleteParallelism(2) + .withTimelineLayoutVersion(TimelineLayoutVersion.CURR_VERSION) + .withWriteStatusClass(MetadataMergeWriteStatus.class) + .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build()) + .withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1024 * 1024).parquetMaxFileSize(1024 * 1024).build()) + .forTable("test-trip-table") + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build()) + .withEmbeddedTimelineServerEnabled(true).withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder() + .withEnableBackupForRemoteFileSystemView(false) // Fail test if problem connecting to timeline-server + .withStorageType(FileSystemViewStorageType.EMBEDDED_KV_STORE).build()).withRollbackUsingMarkers(false).withAutoCommit(false).build(); + + //1. 
prepare data + new HoodieTestDataGenerator().writePartitionMetadata(fs, new String[]{DEFAULT_FIRST_PARTITION_PATH}, basePath); + SparkRDDWriteClient client = getHoodieWriteClient(cfg); + // Write 1 (only inserts) + String newCommitTime = "001"; + client.startCommitWithTime(newCommitTime); + List records = dataGen.generateInsertsForPartition(newCommitTime, 2, DEFAULT_FIRST_PARTITION_PATH); + JavaRDD writeRecords = jsc.parallelize(records, 1); + JavaRDD statuses = client.upsert(writeRecords, newCommitTime); + org.apache.hudi.testutils.Assertions.assertNoWriteErrors(statuses.collect()); + client.commit(newCommitTime, statuses); + + // check fileSlice + HoodieTable table = this.getHoodieTable(metaClient, cfg); + SyncableFileSystemView fsView = getFileSystemViewWithUnCommittedSlices(table.getMetaClient()); + List firstPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_FIRST_PARTITION_PATH).collect(Collectors.toList()); + assertEquals(1, firstPartitionCommit2FileGroups.size()); + assertEquals(1, (int) firstPartitionCommit2FileGroups.get(0).getAllFileSlices().count()); + assertFalse(firstPartitionCommit2FileGroups.get(0).getAllFileSlices().findFirst().get().getBaseFile().isPresent()); + assertEquals(1, firstPartitionCommit2FileGroups.get(0).getAllFileSlices().findFirst().get().getLogFiles().count()); + String generatedFileID = firstPartitionCommit2FileGroups.get(0).getFileGroupId().getFileId(); + + // check hoodieCommitMeta + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( + table.getMetaClient().getCommitTimeline() + .getInstantDetails(new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, "001")) + .get(), + HoodieCommitMetadata.class); + List firstPartitionWriteStat = commitMetadata.getPartitionToWriteStats().get(DEFAULT_FIRST_PARTITION_PATH); + assertEquals(2, firstPartitionWriteStat.size()); + // we have an empty writeStat for each partition + assert firstPartitionWriteStat.stream().anyMatch(wStat -> StringUtils.isNullOrEmpty(wStat.getFileId())); + // we have one non-empty writeStat which must contain updates or inserts + assertEquals(1, firstPartitionWriteStat.stream().filter(wStat -> !StringUtils.isNullOrEmpty(wStat.getFileId())).count()); + firstPartitionWriteStat.stream().filter(wStat -> !StringUtils.isNullOrEmpty(wStat.getFileId())).forEach(wStat -> { + assert wStat.getNumInserts() > 0; + }); + + // Write 2 (one update + inserts in two partitions) + newCommitTime = "002"; + client.startCommitWithTime(newCommitTime); + List updateRecords = Collections.singletonList(dataGen.generateUpdateRecord(records.get(0).getKey(), newCommitTime)); + List insertRecordsInSamePartition = dataGen.generateInsertsForPartition(newCommitTime, 2, DEFAULT_FIRST_PARTITION_PATH); + List insertRecordsInOtherPartition = dataGen.generateInsertsForPartition(newCommitTime, 2, DEFAULT_SECOND_PARTITION_PATH); + List recordsToBeWrite = Stream.concat(Stream.concat(updateRecords.stream(), insertRecordsInSamePartition.stream()), insertRecordsInOtherPartition.stream()) + .collect(Collectors.toList()); + writeRecords = jsc.parallelize(recordsToBeWrite, 1); + statuses = client.upsert(writeRecords, newCommitTime); + client.commit(newCommitTime, statuses); + table = this.getHoodieTable(metaClient, cfg); + commitMetadata = HoodieCommitMetadata.fromBytes( + table.getMetaClient().getCommitTimeline() + .getInstantDetails(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, newCommitTime)) + .get(), + HoodieCommitMetadata.class); + assert 
commitMetadata.getPartitionToWriteStats().containsKey(DEFAULT_FIRST_PARTITION_PATH); + assert commitMetadata.getPartitionToWriteStats().containsKey(DEFAULT_SECOND_PARTITION_PATH); + List hoodieWriteStatOptionList = commitMetadata.getPartitionToWriteStats().get(DEFAULT_FIRST_PARTITION_PATH); + // Both the update and the insert records should enter the same existing file group due to small file handling + assertEquals(1, hoodieWriteStatOptionList.size()); + assertEquals(generatedFileID, hoodieWriteStatOptionList.get(0).getFileId()); + // check insert and update numbers + assertEquals(2, hoodieWriteStatOptionList.get(0).getNumInserts()); + assertEquals(1, hoodieWriteStatOptionList.get(0).getNumUpdateWrites()); + + List secondHoodieWriteStatOptionList = commitMetadata.getPartitionToWriteStats().get(DEFAULT_SECOND_PARTITION_PATH); + // All inserts should enter one file group + assertEquals(1, secondHoodieWriteStatOptionList.size()); + String fileIdInPartitionTwo = secondHoodieWriteStatOptionList.get(0).getFileId(); + assertEquals(2, secondHoodieWriteStatOptionList.get(0).getNumInserts()); + + // Rollback + HoodieInstant rollBackInstant = new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.DELTA_COMMIT_ACTION, "002"); + BaseRollbackPlanActionExecutor mergeOnReadRollbackPlanActionExecutor = + new BaseRollbackPlanActionExecutor(context, cfg, table, "003", rollBackInstant, false, + cfg.shouldRollbackUsingMarkers()); + mergeOnReadRollbackPlanActionExecutor.execute().get(); + MergeOnReadRollbackActionExecutor mergeOnReadRollbackActionExecutor = new MergeOnReadRollbackActionExecutor( + context, + cfg, + table, + "003", + rollBackInstant, + true, + false); + + //3. assert the rollback stat + Map rollbackMetadata = mergeOnReadRollbackActionExecutor.execute().getPartitionMetadata(); + assertEquals(2, rollbackMetadata.size()); + + //4. 
assert file group after rollback, and compare to the rollback stat + // assert the first partition data and log file size + HoodieRollbackPartitionMetadata partitionMetadata = rollbackMetadata.get(DEFAULT_FIRST_PARTITION_PATH); + assertTrue(partitionMetadata.getSuccessDeleteFiles().isEmpty()); + assertTrue(partitionMetadata.getFailedDeleteFiles().isEmpty()); + assertEquals(1, partitionMetadata.getRollbackLogFiles().size()); + + // assert the second partition data and log file size + partitionMetadata = rollbackMetadata.get(DEFAULT_SECOND_PARTITION_PATH); + assertEquals(1, partitionMetadata.getSuccessDeleteFiles().size()); + assertTrue(partitionMetadata.getFailedDeleteFiles().isEmpty()); + assertTrue(partitionMetadata.getRollbackLogFiles().isEmpty()); + } + @Test public void testFailForCompletedInstants() { Assertions.assertThrows(IllegalArgumentException.class, () -> { @@ -163,11 +305,20 @@ public void testRollbackWhenFirstCommitFail() throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder() .withRollbackUsingMarkers(false) - .withPath(basePath).withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()).build(); + .withPath(basePath).build(); try (SparkRDDWriteClient client = getHoodieWriteClient(config)) { client.startCommitWithTime("001"); client.insert(jsc.emptyRDD(), "001"); client.rollback("001"); } } + + private void setUpDFS() throws IOException { + initDFS(); + initSparkContexts(); + //just generate two partitions + dataGen = new HoodieTestDataGenerator(new String[] {DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}); + initFileSystem(); + initDFSMetaClient(); + } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestCleanPlanExecutor.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestCleanPlanExecutor.java new file mode 100644 index 0000000000000..961523eb6b993 --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestCleanPlanExecutor.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table.functional; + +import org.apache.hudi.common.HoodieCleanStat; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.model.BootstrapFileMapping; +import org.apache.hudi.common.model.HoodieCleaningPolicy; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.testutils.HoodieTestTable; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.TestCleaner; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; + +import java.nio.charset.StandardCharsets; +import java.time.Instant; +import java.time.ZoneId; +import java.time.ZonedDateTime; +import java.util.Collections; +import java.util.Date; +import java.util.HashMap; +import java.util.Map; +import java.util.List; +import java.util.UUID; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestCleanPlanExecutor extends TestCleaner { + + /** + * Tests cleaning service based on number of hours retained. + */ + @ParameterizedTest + @MethodSource("argumentsForTestKeepLatestCommits") + public void testKeepXHoursWithCleaning(boolean simulateFailureRetry, boolean enableIncrementalClean, boolean enableBootstrapSourceClean) throws Exception { + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build()) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withIncrementalCleaningMode(enableIncrementalClean) + .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.EAGER) + .withCleanBootstrapBaseFileEnabled(enableBootstrapSourceClean) + .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_BY_HOURS).cleanerNumHoursRetained(2).build()) + .build(); + + HoodieTestTable testTable = HoodieTestTable.of(metaClient); + String p0 = "2020/01/01"; + String p1 = "2020/01/02"; + Map<String, List<BootstrapFileMapping>> bootstrapMapping = enableBootstrapSourceClean ? generateBootstrapIndexAndSourceData(p0, p1) : null; + + String file1P0C0 = enableBootstrapSourceClean ? bootstrapMapping.get(p0).get(0).getFileId() + : UUID.randomUUID().toString(); + String file1P1C0 = enableBootstrapSourceClean ? 
bootstrapMapping.get(p1).get(0).getFileId() + : UUID.randomUUID().toString(); + Instant instant = Instant.now(); + ZonedDateTime commitDateTime = ZonedDateTime.ofInstant(instant, ZoneId.systemDefault()); + int minutesForFirstCommit = 150; + String firstCommitTs = HoodieActiveTimeline.formatDate(Date.from(commitDateTime.minusMinutes(minutesForFirstCommit).toInstant())); + testTable.addInflightCommit(firstCommitTs).withBaseFilesInPartition(p0, file1P0C0).withBaseFilesInPartition(p1, file1P1C0); + + HoodieCommitMetadata commitMetadata = generateCommitMetadata(firstCommitTs, + Collections.unmodifiableMap(new HashMap<String, List<String>>() { + { + put(p0, CollectionUtils.createImmutableList(file1P0C0)); + put(p1, CollectionUtils.createImmutableList(file1P1C0)); + } + }) + ); + metaClient.getActiveTimeline().saveAsComplete( + new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, firstCommitTs), + Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); + + metaClient = HoodieTableMetaClient.reload(metaClient); + + List hoodieCleanStatsOne = runCleaner(config, simulateFailureRetry); + assertEquals(0, hoodieCleanStatsOne.size(), "Must not scan any partitions and clean any files"); + assertTrue(testTable.baseFileExists(p0, firstCommitTs, file1P0C0)); + assertTrue(testTable.baseFileExists(p1, firstCommitTs, file1P1C0)); + + // make next commit, with 1 insert & 1 update per partition + int minutesForSecondCommit = 90; + String secondCommitTs = HoodieActiveTimeline.formatDate(Date.from(commitDateTime.minusMinutes(minutesForSecondCommit).toInstant())); + Map partitionAndFileId002 = testTable.addInflightCommit(secondCommitTs).getFileIdsWithBaseFilesInPartitions(p0, p1); + String file2P0C1 = partitionAndFileId002.get(p0); + String file2P1C1 = partitionAndFileId002.get(p1); + testTable.forCommit(secondCommitTs).withBaseFilesInPartition(p0, file1P0C0).withBaseFilesInPartition(p1, file1P1C0); + commitMetadata = generateCommitMetadata(secondCommitTs, new HashMap<String, List<String>>() { + { + put(p0, CollectionUtils.createImmutableList(file1P0C0, file2P0C1)); + put(p1, CollectionUtils.createImmutableList(file1P1C0, file2P1C1)); + } + }); + metaClient.getActiveTimeline().saveAsComplete( + new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, secondCommitTs), + Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); + List hoodieCleanStatsTwo = runCleaner(config, simulateFailureRetry); + assertEquals(2, hoodieCleanStatsTwo.size(), "Should clean one file each from both the partitions"); + assertTrue(testTable.baseFileExists(p0, secondCommitTs, file2P0C1)); + assertTrue(testTable.baseFileExists(p1, secondCommitTs, file2P1C1)); + assertTrue(testTable.baseFileExists(p0, secondCommitTs, file1P0C0)); + assertTrue(testTable.baseFileExists(p1, secondCommitTs, file1P1C0)); + assertFalse(testTable.baseFileExists(p0, firstCommitTs, file1P0C0)); + assertFalse(testTable.baseFileExists(p1, firstCommitTs, file1P1C0)); + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableClustering.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableClustering.java index a0ec0de371478..5438fbcfc0d98 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableClustering.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableClustering.java
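For reference, the new TestCleanPlanExecutor above exercises the hours-based cleaning policy. A minimal config sketch of that policy, built only from builder calls that appear in the test itself; the table path is illustrative:

import org.apache.hudi.common.model.HoodieCleaningPolicy;
import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieWriteConfig;

// Retain file versions written within the last 2 hours; older versions
// become eligible for cleaning on the next clean run.
HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
    .withPath("/tmp/hudi_table")   // illustrative path
    .withCompactionConfig(HoodieCompactionConfig.newBuilder()
        .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_BY_HOURS)
        .cleanerNumHoursRetained(2)
        .build())
    .build();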
@@ -132,7 +132,7 @@ void testClustering(boolean doUpdates, boolean populateMetaFields, boolean prese newCommitTime = "003"; client.startCommitWithTime(newCommitTime); records = dataGen.generateUpdates(newCommitTime, 100); - updateRecordsInMORTable(metaClient, records, client, cfg, newCommitTime); + updateRecordsInMORTable(metaClient, records, client, cfg, newCommitTime, false); } HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient); @@ -201,7 +201,7 @@ void testClusteringWithNoBaseFiles(boolean doUpdates) throws Exception { newCommitTime = "003"; client.startCommitWithTime(newCommitTime); records = dataGen.generateUpdates(newCommitTime, 100); - updateRecordsInMORTable(metaClient, records, client, cfg, newCommitTime); + updateRecordsInMORTable(metaClient, records, client, cfg, newCommitTime, false); } HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableCompaction.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableCompaction.java index 13903bf54b70d..f4f47d375b22d 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableCompaction.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableCompaction.java @@ -22,6 +22,7 @@ import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; @@ -113,13 +114,14 @@ private List writeData(String instant, int numRecords, boolean doCo JavaRDD records = jsc().parallelize(dataGen.generateInserts(instant, numRecords), 2); metaClient = HoodieTableMetaClient.reload(metaClient); client.startCommitWithTime(instant); - List writeStatues = client.upsert(records, instant).collect(); - org.apache.hudi.testutils.Assertions.assertNoWriteErrors(writeStatues); + List writeStatuses = client.upsert(records, instant).collect(); + org.apache.hudi.testutils.Assertions.assertNoWriteErrors(writeStatuses); if (doCommit) { - Assertions.assertTrue(client.commitStats(instant, writeStatues.stream().map(WriteStatus::getStat).collect(Collectors.toList()), - Option.empty(), metaClient.getCommitActionType())); + List writeStats = writeStatuses.stream().map(WriteStatus::getStat).collect(Collectors.toList()); + boolean committed = client.commitStats(instant, writeStats, Option.empty(), metaClient.getCommitActionType()); + Assertions.assertTrue(committed); } metaClient = HoodieTableMetaClient.reload(metaClient); - return writeStatues; + return writeStatuses; } } \ No newline at end of file diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableIncrementalRead.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableIncrementalRead.java index c80374b64f4a2..5df7b4daecc72 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableIncrementalRead.java +++ 
b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableIncrementalRead.java @@ -20,7 +20,6 @@ package org.apache.hudi.table.functional; import org.apache.hudi.client.SparkRDDWriteClient; -import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieRecord; @@ -86,7 +85,7 @@ public void testIncrementalReadsWithCompaction() throws Exception { Properties props = new Properties(); props.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieFileFormat.PARQUET.toString()); HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, props); - HoodieWriteConfig cfg = getConfigBuilder(true).withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()).build(); + HoodieWriteConfig cfg = getConfigBuilder(true).build(); try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) { /* @@ -118,7 +117,7 @@ public void testIncrementalReadsWithCompaction() throws Exception { String updateTime = "004"; client.startCommitWithTime(updateTime); List records004 = dataGen.generateUpdates(updateTime, 100); - updateRecordsInMORTable(metaClient, records004, client, cfg, updateTime); + updateRecordsInMORTable(metaClient, records004, client, cfg, updateTime, false); // verify RO incremental reads - only one base file shows up because updates to into log files incrementalROFiles = getROIncrementalFiles(partitionPath, false); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableInsertUpdateDelete.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableInsertUpdateDelete.java index 62ce007496683..2955147b4053f 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableInsertUpdateDelete.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableInsertUpdateDelete.java @@ -19,6 +19,7 @@ package org.apache.hudi.table.functional; +import org.apache.hadoop.fs.Path; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.model.FileSlice; @@ -27,6 +28,7 @@ import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant; @@ -35,10 +37,12 @@ import org.apache.hudi.common.table.view.TableFileSystemView; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.hudi.testutils.HoodieClientTestUtils; import org.apache.hudi.testutils.HoodieMergeOnReadTestUtils; import org.apache.hudi.testutils.SparkClientFunctionalTestHarness; @@ -54,6 +58,7 @@ import org.junit.jupiter.params.provider.MethodSource; import 
org.junit.jupiter.params.provider.ValueSource; +import java.util.Collection; import java.util.List; import java.util.Properties; import java.util.stream.Collectors; @@ -104,7 +109,7 @@ public void testSimpleInsertAndUpdate(HoodieFileFormat fileFormat, boolean popul newCommitTime = "004"; client.startCommitWithTime(newCommitTime); records = dataGen.generateUpdates(newCommitTime, 100); - updateRecordsInMORTable(metaClient, records, client, cfg, newCommitTime); + updateRecordsInMORTable(metaClient, records, client, cfg, newCommitTime, false); String compactionCommitTime = client.scheduleCompaction(Option.empty()).get().toString(); client.compact(compactionCommitTime); @@ -133,6 +138,48 @@ public void testSimpleInsertAndUpdate(HoodieFileFormat fileFormat, boolean popul } } + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testInlineScheduleCompaction(boolean scheduleInlineCompaction) throws Exception { + HoodieFileFormat fileFormat = HoodieFileFormat.PARQUET; + Properties properties = new Properties(); + properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), fileFormat.toString()); + HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties); + + HoodieWriteConfig cfg = getConfigBuilder(false) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024) + .withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(2).withPreserveCommitMetadata(true).withScheduleInlineCompaction(scheduleInlineCompaction).build()) + .build(); + try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) { + + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + /* + * Write 1 (only inserts) + */ + String newCommitTime = "001"; + client.startCommitWithTime(newCommitTime); + + List records = dataGen.generateInserts(newCommitTime, 200); + Stream dataFiles = insertRecordsToMORTable(metaClient, records, client, cfg, newCommitTime, true); + assertTrue(dataFiles.findAny().isPresent(), "should list the base files we wrote in the delta commit"); + + /* + * Write 2 (updates) + */ + newCommitTime = "004"; + client.startCommitWithTime(newCommitTime); + records = dataGen.generateUpdates(newCommitTime, 100); + updateRecordsInMORTable(metaClient, records, client, cfg, newCommitTime, true); + + // verify that there is a commit + if (scheduleInlineCompaction) { + assertEquals(metaClient.reloadActiveTimeline().getAllCommitsTimeline().filterPendingCompactionTimeline().countInstants(), 1); + } else { + assertEquals(metaClient.reloadActiveTimeline().getAllCommitsTimeline().filterPendingCompactionTimeline().countInstants(), 0); + } + } + } + @ParameterizedTest @ValueSource(booleans = {true, false}) public void testSimpleInsertUpdateAndDelete(boolean populateMetaFields) throws Exception { @@ -213,8 +260,11 @@ public void testSimpleInsertUpdateAndDelete(boolean populateMetaFields) throws E dataFilesToRead = tableView.getLatestBaseFiles(); assertTrue(dataFilesToRead.findAny().isPresent()); - List dataFiles = tableView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList()); - List recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), dataFiles, basePath(), new JobConf(hadoopConf()), true, false); + List inputPaths = tableView.getLatestBaseFiles() + .map(baseFile -> new Path(baseFile.getPath()).getParent().toString()) + .collect(Collectors.toList()); + List recordsRead = + HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), 
inputPaths, basePath(), new JobConf(hadoopConf()), true, false); // Wrote 20 records and deleted 20 records, so remaining 20-20 = 0 assertEquals(0, recordsRead.size(), "Must contain 0 records"); } @@ -260,11 +310,12 @@ public void testSimpleInsertsGeneratedIntoLogFiles() throws Exception { assertTrue(numLogFiles > 0); // Do a compaction String instantTime = writeClient.scheduleCompaction(Option.empty()).get().toString(); - statuses = (JavaRDD) writeClient.compact(instantTime); + HoodieWriteMetadata<JavaRDD<WriteStatus>> compactionMetadata = writeClient.compact(instantTime); String extension = table.getBaseFileExtension(); - assertEquals(numLogFiles, statuses.map(status -> status.getStat().getPath().contains(extension)).count()); - assertEquals(numLogFiles, statuses.count()); - writeClient.commitCompaction(instantTime, statuses, Option.empty()); + Collection<List<HoodieWriteStat>> stats = compactionMetadata.getCommitMetadata().get().getPartitionToWriteStats().values(); + assertEquals(numLogFiles, stats.stream().flatMap(Collection::stream).filter(state -> state.getPath().contains(extension)).count()); + assertEquals(numLogFiles, stats.stream().mapToLong(Collection::size).sum()); + writeClient.commitCompaction(instantTime, compactionMetadata.getCommitMetadata().get(), Option.empty()); } } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableRollback.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableRollback.java index 38becc92c65ff..d552955030baa 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableRollback.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableRollback.java @@ -24,10 +24,13 @@ import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; import org.apache.hudi.common.model.HoodieFileGroup; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.marker.MarkerType; @@ -40,6 +43,7 @@ import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.testutils.HoodieTestTable; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieStorageConfig; import org.apache.hudi.config.HoodieWriteConfig; @@ -47,6 +51,7 @@ import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.hudi.testutils.HoodieMergeOnReadTestUtils; import org.apache.hudi.testutils.SparkClientFunctionalTestHarness; @@ -64,6 +69,7 @@ import java.nio.file.Files; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -142,8 +148,14 @@ void 
testCOWToMORConvertedTableRollback(boolean rollbackUsingMarkers) throws Exc @ParameterizedTest @ValueSource(booleans = {true, false}) void testRollbackWithDeltaAndCompactionCommit(boolean rollbackUsingMarkers) throws Exception { - HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(false, rollbackUsingMarkers, HoodieIndex.IndexType.SIMPLE) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()); + // NOTE: First writer will have Metadata table DISABLED + HoodieWriteConfig.Builder cfgBuilder = + getConfigBuilder(false, rollbackUsingMarkers, HoodieIndex.IndexType.SIMPLE) + .withMetadataConfig( + HoodieMetadataConfig.newBuilder() + .enable(false) + .build()); + addConfigsForPopulateMetaFields(cfgBuilder, true); HoodieWriteConfig cfg = cfgBuilder.build(); @@ -166,10 +178,12 @@ void testRollbackWithDeltaAndCompactionCommit(boolean rollbackUsingMarkers) thro JavaRDD writeRecords = jsc().parallelize(records, 1); JavaRDD writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime); - client.commit(newCommitTime, writeStatusJavaRDD); + List statuses = writeStatusJavaRDD.collect(); assertNoWriteErrors(statuses); + client.commit(newCommitTime, jsc().parallelize(statuses)); + HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient); Option deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant(); @@ -194,6 +208,7 @@ void testRollbackWithDeltaAndCompactionCommit(boolean rollbackUsingMarkers) thro */ final String commitTime1 = "002"; // WriteClient with custom config (disable small file handling) + // NOTE: Second writer will have Metadata table ENABLED try (SparkRDDWriteClient secondClient = getHoodieWriteClient(getHoodieWriteConfigWithSmallFileHandlingOff(false));) { secondClient.startCommitWithTime(commitTime1); @@ -201,8 +216,10 @@ void testRollbackWithDeltaAndCompactionCommit(boolean rollbackUsingMarkers) thro copyOfRecords = dataGen.generateUpdates(commitTime1, copyOfRecords); copyOfRecords.addAll(dataGen.generateInserts(commitTime1, 200)); - List dataFiles = tableView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList()); - List recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), dataFiles, + List inputPaths = tableView.getLatestBaseFiles() + .map(baseFile -> new Path(baseFile.getPath()).getParent().toString()) + .collect(Collectors.toList()); + List recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath()); assertEquals(200, recordsRead.size()); @@ -218,8 +235,10 @@ void testRollbackWithDeltaAndCompactionCommit(boolean rollbackUsingMarkers) thro .contains(commitTime1)).map(fileStatus -> fileStatus.getPath().toString()).collect(Collectors.toList()); assertEquals(0, remainingFiles.size(), "There files should have been rolled-back " + "when rolling back commit " + commitTime1 + " but are still remaining. 
Files: " + remainingFiles); - dataFiles = tableView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList()); - recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), dataFiles, basePath()); + inputPaths = tableView.getLatestBaseFiles() + .map(baseFile -> new Path(baseFile.getPath()).getParent().toString()) + .collect(Collectors.toList()); + recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath()); assertEquals(200, recordsRead.size()); } @@ -234,8 +253,10 @@ void testRollbackWithDeltaAndCompactionCommit(boolean rollbackUsingMarkers) thro copyOfRecords = dataGen.generateUpdates(commitTime2, copyOfRecords); copyOfRecords.addAll(dataGen.generateInserts(commitTime2, 200)); - List dataFiles = tableView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList()); - List recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), dataFiles, + List inputPaths = tableView.getLatestBaseFiles() + .map(baseFile -> new Path(baseFile.getPath()).getParent().toString()) + .collect(Collectors.toList()); + List recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath()); assertEquals(200, recordsRead.size()); @@ -255,8 +276,10 @@ void testRollbackWithDeltaAndCompactionCommit(boolean rollbackUsingMarkers) thro metaClient = HoodieTableMetaClient.reload(metaClient); hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient); tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); - dataFiles = tableView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList()); - recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), dataFiles, basePath()); + inputPaths = tableView.getLatestBaseFiles() + .map(baseFile -> new Path(baseFile.getPath()).getParent().toString()) + .collect(Collectors.toList()); + recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath()); // check that the number of records read is still correct after rollback operation assertEquals(200, recordsRead.size()); @@ -268,11 +291,13 @@ void testRollbackWithDeltaAndCompactionCommit(boolean rollbackUsingMarkers) thro thirdClient.startCommitWithTime(newCommitTime); writeStatusJavaRDD = thirdClient.upsert(writeRecords, newCommitTime); + statuses = writeStatusJavaRDD.collect(); - thirdClient.commit(newCommitTime, writeStatusJavaRDD); // Verify there are no errors assertNoWriteErrors(statuses); + thirdClient.commit(newCommitTime, jsc().parallelize(statuses)); + metaClient = HoodieTableMetaClient.reload(metaClient); String compactionInstantTime = thirdClient.scheduleCompaction(Option.empty()).get().toString(); @@ -300,8 +325,7 @@ void testMultiRollbackWithDeltaAndCompactionCommit() throws Exception { boolean populateMetaFields = true; HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(false) // Timeline-server-based markers are not used for multi-rollback tests - .withMarkersType(MarkerType.DIRECT.name()) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()); + .withMarkersType(MarkerType.DIRECT.name()); addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields); HoodieWriteConfig cfg = cfgBuilder.build(); @@ -310,8 +334,8 @@ void testMultiRollbackWithDeltaAndCompactionCommit() throws Exception { HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties); 
try (final SparkRDDWriteClient client = getHoodieWriteClient(cfg)) { - HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + /* * Write 1 (only inserts) */ @@ -322,20 +346,29 @@ void testMultiRollbackWithDeltaAndCompactionCommit() throws Exception { JavaRDD writeRecords = jsc().parallelize(records, 1); JavaRDD writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime); - client.commit(newCommitTime, writeStatusJavaRDD); + List statuses = writeStatusJavaRDD.collect(); assertNoWriteErrors(statuses); + + client.commit(newCommitTime, jsc().parallelize(statuses)); client.close(); - HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient); + Option> instantCommitMetadataPairOpt = + metaClient.getActiveTimeline().getLastCommitMetadataWithValidData(); - Option deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant(); - assertTrue(deltaCommit.isPresent()); - assertEquals("001", deltaCommit.get().getTimestamp(), "Delta commit should be 001"); + assertTrue(instantCommitMetadataPairOpt.isPresent()); + + HoodieInstant commitInstant = instantCommitMetadataPairOpt.get().getKey(); + + assertEquals("001", commitInstant.getTimestamp()); + assertEquals(HoodieTimeline.DELTA_COMMIT_ACTION, commitInstant.getAction()); + assertEquals(200, getTotalRecordsWritten(instantCommitMetadataPairOpt.get().getValue())); Option commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); assertFalse(commit.isPresent()); + HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient); + FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable); HoodieTableFileSystemView tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); Stream dataFilesToRead = tableView.getLatestBaseFiles(); @@ -345,6 +378,7 @@ void testMultiRollbackWithDeltaAndCompactionCommit() throws Exception { dataFilesToRead = tableView.getLatestBaseFiles(); assertTrue(dataFilesToRead.findAny().isPresent(), "Should list the base files we wrote in the delta commit"); + /* * Write 2 (inserts + updates) */ @@ -352,8 +386,7 @@ void testMultiRollbackWithDeltaAndCompactionCommit() throws Exception { // WriteClient with custom config (disable small file handling) HoodieWriteConfig smallFileWriteConfig = getHoodieWriteConfigWithSmallFileHandlingOffBuilder(populateMetaFields) // Timeline-server-based markers are not used for multi-rollback tests - .withMarkersType(MarkerType.DIRECT.name()) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()).build(); + .withMarkersType(MarkerType.DIRECT.name()).build(); try (SparkRDDWriteClient nClient = getHoodieWriteClient(smallFileWriteConfig)) { nClient.startCommitWithTime(newCommitTime); @@ -361,7 +394,9 @@ void testMultiRollbackWithDeltaAndCompactionCommit() throws Exception { copyOfRecords = dataGen.generateUpdates(newCommitTime, copyOfRecords); copyOfRecords.addAll(dataGen.generateInserts(newCommitTime, 200)); - List dataFiles = tableView.getLatestBaseFiles().map(hf -> hf.getPath()).collect(Collectors.toList()); + List dataFiles = tableView.getLatestBaseFiles() + .map(baseFile -> new Path(baseFile.getPath()).getParent().toString()) + .collect(Collectors.toList()); List recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), dataFiles, basePath()); assertEquals(200, recordsRead.size()); @@ -369,7 +404,9 @@ void testMultiRollbackWithDeltaAndCompactionCommit() throws Exception { statuses = 
nClient.upsert(jsc().parallelize(copyOfRecords, 1), newCommitTime).collect(); // Verify there are no errors assertNoWriteErrors(statuses); - nClient.commit(newCommitTime, writeStatusJavaRDD); + + nClient.commit(newCommitTime, jsc().parallelize(statuses)); + copyOfRecords.clear(); } @@ -386,11 +423,12 @@ void testMultiRollbackWithDeltaAndCompactionCommit() throws Exception { writeRecords = jsc().parallelize(records, 1); writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime); - client.commit(newCommitTime, writeStatusJavaRDD); statuses = writeStatusJavaRDD.collect(); // Verify there are no errors assertNoWriteErrors(statuses); + client.commit(newCommitTime, jsc().parallelize(statuses)); + metaClient = HoodieTableMetaClient.reload(metaClient); String compactionInstantTime = "004"; @@ -407,17 +445,18 @@ void testMultiRollbackWithDeltaAndCompactionCommit() throws Exception { writeRecords = jsc().parallelize(records, 1); writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime); - client.commit(newCommitTime, writeStatusJavaRDD); statuses = writeStatusJavaRDD.collect(); // Verify there are no errors assertNoWriteErrors(statuses); + client.commit(newCommitTime, jsc().parallelize(statuses)); + metaClient = HoodieTableMetaClient.reload(metaClient); compactionInstantTime = "006"; client.scheduleCompactionAtInstant(compactionInstantTime, Option.empty()); - JavaRDD ws = (JavaRDD) client.compact(compactionInstantTime); - client.commitCompaction(compactionInstantTime, ws, Option.empty()); + HoodieWriteMetadata> compactionMetadata = client.compact(compactionInstantTime); + client.commitCompaction(compactionInstantTime, compactionMetadata.getCommitMetadata().get(), Option.empty()); allFiles = listAllBaseFilesInPath(hoodieTable); metaClient = HoodieTableMetaClient.reload(metaClient); @@ -440,7 +479,9 @@ void testMultiRollbackWithDeltaAndCompactionCommit() throws Exception { statuses = client.upsert(jsc().parallelize(copyOfRecords, 1), newCommitTime).collect(); // Verify there are no errors assertNoWriteErrors(statuses); - client.commit(newCommitTime, writeStatusJavaRDD); + + client.commit(newCommitTime, jsc().parallelize(statuses)); + copyOfRecords.clear(); // Rollback latest commit first @@ -464,13 +505,19 @@ void testMultiRollbackWithDeltaAndCompactionCommit() throws Exception { } } + private long getTotalRecordsWritten(HoodieCommitMetadata commitMetadata) { + return commitMetadata.getPartitionToWriteStats().values().stream() + .flatMap(Collection::stream) + .map(stat -> stat.getNumWrites() + stat.getNumUpdateWrites()) + .reduce(0L, Long::sum); + } + @ParameterizedTest @ValueSource(booleans = {true, false}) void testMORTableRestore(boolean restoreAfterCompaction) throws Exception { HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(false) // Timeline-server-based markers are not used for multi-rollback tests - .withMarkersType(MarkerType.DIRECT.name()) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build()); + .withMarkersType(MarkerType.DIRECT.name()); HoodieWriteConfig cfg = cfgBuilder.build(); Properties properties = new Properties(); @@ -495,8 +542,8 @@ void testMORTableRestore(boolean restoreAfterCompaction) throws Exception { metaClient = HoodieTableMetaClient.reload(metaClient); String compactionInstantTime = "005"; client.scheduleCompactionAtInstant(compactionInstantTime, Option.empty()); - JavaRDD ws = (JavaRDD) client.compact(compactionInstantTime); - client.commitCompaction(compactionInstantTime, ws, Option.empty()); + HoodieWriteMetadata> 
compactionMetadata = client.compact(compactionInstantTime); + client.commitCompaction(compactionInstantTime, compactionMetadata.getCommitMetadata().get(), Option.empty()); validateRecords(cfg, metaClient, updates3); List updates4 = updateAndGetRecords("006", client, dataGen, records); @@ -516,8 +563,6 @@ private List insertAndGetRecords(String newCommitTime, SparkRDDWri JavaRDD writeRecords = jsc().parallelize(records, 1); JavaRDD writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime); client.commit(newCommitTime, writeStatusJavaRDD); - List statuses = writeStatusJavaRDD.collect(); - assertNoWriteErrors(statuses); return records; } @@ -534,8 +579,10 @@ private void validateRecords(HoodieWriteConfig cfg, HoodieTableMetaClient metaCl HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient); FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable); HoodieTableFileSystemView tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); - List dataFiles = tableView.getLatestBaseFiles().map(hf -> hf.getPath()).collect(Collectors.toList()); - List recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), dataFiles, + List inputPaths = tableView.getLatestBaseFiles() + .map(hf -> new Path(hf.getPath()).getParent().toString()) + .collect(Collectors.toList()); + List recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath()); assertRecords(expectedRecords, recordsRead); } @@ -545,7 +592,7 @@ private void assertRecords(List inputRecords, List Map expectedRecords = new HashMap<>(); inputRecords.forEach(entry -> { try { - expectedRecords.put(entry.getRecordKey(), ((GenericRecord) entry.getData().getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA).get())); + expectedRecords.put(entry.getRecordKey(), (GenericRecord) ((HoodieRecordPayload) entry.getData()).getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA).get()); } catch (IOException e) { e.printStackTrace(); } @@ -596,9 +643,8 @@ void testInsertsGeneratedIntoLogFilesRollback(boolean rollbackUsingMarkers) thro List records = dataGen.generateInserts(newCommitTime, 100); JavaRDD recordsRDD = jsc().parallelize(records, 1); - JavaRDD statuses = writeClient.insert(recordsRDD, newCommitTime); // trigger an action - List writeStatuses = statuses.collect(); + List writeStatuses = ((JavaRDD) writeClient.insert(recordsRDD, newCommitTime)).collect(); // Ensure that inserts are written to only log files assertEquals(0, @@ -708,11 +754,14 @@ void testInsertsGeneratedIntoLogFilesRollbackAfterCompaction(boolean rollbackUsi assertTrue(numLogFiles > 0); // Do a compaction newCommitTime = writeClient.scheduleCompaction(Option.empty()).get().toString(); - statuses = (JavaRDD) writeClient.compact(newCommitTime); + HoodieWriteMetadata> compactionMetadata = writeClient.compact(newCommitTime); + statuses = compactionMetadata.getWriteStatuses(); // Ensure all log files have been compacted into base files String extension = table.getBaseFileExtension(); - assertEquals(numLogFiles, statuses.map(status -> status.getStat().getPath().contains(extension)).count()); - assertEquals(numLogFiles, statuses.count()); + Collection> stats = compactionMetadata.getCommitMetadata().get().getPartitionToWriteStats().values(); + assertEquals(numLogFiles, stats.stream().flatMap(Collection::stream).filter(state -> state.getPath().contains(extension)).count()); + assertEquals(numLogFiles, stats.stream().mapToLong(Collection::size).sum()); + 
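+ // Sketch of the compaction flow these tests now follow (grounded in this patch; in this
+ // particular test the commit is intentionally left out so the pending compaction can be
+ // rolled back below):
+ //
+ //   HoodieWriteMetadata<JavaRDD<WriteStatus>> metadata = writeClient.compact(instantTime);
+ //   HoodieCommitMetadata commitMetadata = metadata.getCommitMetadata().get();
+ //   writeClient.commitCompaction(instantTime, commitMetadata, Option.empty());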
//writeClient.commitCompaction(newCommitTime, statuses, Option.empty()); // Trigger a rollback of compaction table.getActiveTimeline().reload(); @@ -815,14 +864,15 @@ private List updateRecords(SparkRDDWriteClient client, HoodieTestD private long doCompaction(SparkRDDWriteClient client, HoodieTableMetaClient metaClient, HoodieWriteConfig cfg, long numLogFiles) throws IOException { // Do a compaction String instantTime = client.scheduleCompaction(Option.empty()).get().toString(); - JavaRDD writeStatuses = (JavaRDD) client.compact(instantTime); + HoodieWriteMetadata> compactionMetadata = client.compact(instantTime); metaClient.reloadActiveTimeline(); HoodieTable table = HoodieSparkTable.create(cfg, context(), metaClient); String extension = table.getBaseFileExtension(); - assertEquals(numLogFiles, writeStatuses.map(status -> status.getStat().getPath().contains(extension)).count()); - assertEquals(numLogFiles, writeStatuses.count()); - client.commitCompaction(instantTime, writeStatuses, Option.empty()); + Collection> stats = compactionMetadata.getCommitMetadata().get().getPartitionToWriteStats().values(); + assertEquals(numLogFiles, stats.stream().flatMap(Collection::stream).filter(state -> state.getPath().contains(extension)).count()); + assertEquals(numLogFiles, stats.stream().mapToLong(Collection::size).sum()); + client.commitCompaction(instantTime, compactionMetadata.getCommitMetadata().get(), Option.empty()); return numLogFiles; } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestMarkerBasedRollbackStrategy.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestMarkerBasedRollbackStrategy.java index 8b23cf25768e3..fd2af1cdca25a 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestMarkerBasedRollbackStrategy.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestMarkerBasedRollbackStrategy.java @@ -133,7 +133,6 @@ public void testCopyOnWriteRollback(boolean useFileListingMetadata) throws Excep assertEquals(1, stat.getSuccessDeleteFiles().size()); assertEquals(0, stat.getFailedDeleteFiles().size()); assertEquals(0, stat.getCommandBlocksCount().size()); - assertEquals(0, stat.getWrittenLogFileSizeMap().size()); } } } @@ -162,8 +161,6 @@ public void testMergeOnReadRollback(boolean useFileListingMetadata) throws Excep assertEquals(0, stat.getFailedDeleteFiles().size()); assertEquals(1, stat.getCommandBlocksCount().size()); stat.getCommandBlocksCount().forEach((fileStatus, len) -> assertTrue(fileStatus.getPath().getName().contains(HoodieFileFormat.HOODIE_LOG.getFileExtension()))); - assertEquals(1, stat.getWrittenLogFileSizeMap().size()); - stat.getWrittenLogFileSizeMap().forEach((fileStatus, len) -> assertTrue(fileStatus.getPath().getName().contains(HoodieFileFormat.HOODIE_LOG.getFileExtension()))); } } } diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/marker/TestWriteMarkersBase.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/marker/TestWriteMarkersBase.java index 5f96041b372d9..6ba783c749ffb 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/marker/TestWriteMarkersBase.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/marker/TestWriteMarkersBase.java @@ -103,7 +103,7 @@ public void testDeletionWhenMarkerDirNotExists() throws IOException { @ParameterizedTest @ValueSource(booleans = {true, false}) public void 
testDataPathsWhenCreatingOrMerging(boolean isTablePartitioned) throws IOException { - // add markfiles + // add marker files createSomeMarkers(isTablePartitioned); // add invalid file createInvalidFile(isTablePartitioned ? "2020/06/01" : "", "invalid_file3"); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/upgrade/TestUpgradeDowngrade.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/upgrade/TestUpgradeDowngrade.java index 5f5dfdec5dce4..403b67e554d76 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/upgrade/TestUpgradeDowngrade.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/upgrade/TestUpgradeDowngrade.java @@ -232,6 +232,43 @@ public void testUpgradeOneToTwo(HoodieTableType tableType) throws IOException { assertTableProps(cfg); } + @Test + public void testUpgradeDowngradeBetweenThreeAndCurrentVersion() throws IOException { + // init config, table and client. + Map params = new HashMap<>(); + addNewTableParamsToProps(params); + HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).withRollbackUsingMarkers(false).withProps(params).build(); + + // write inserts + SparkRDDWriteClient client = getHoodieWriteClient(cfg); + doInsert(client); + + // current version should have TABLE_CHECKSUM key + assertEquals(HoodieTableVersion.current(), metaClient.getTableConfig().getTableVersion()); + assertTableVersionFromPropertyFile(HoodieTableVersion.current()); + assertTrue(metaClient.getTableConfig().getProps().containsKey(HoodieTableConfig.TABLE_CHECKSUM.key())); + String checksum = metaClient.getTableConfig().getProps().getString(HoodieTableConfig.TABLE_CHECKSUM.key()); + + // downgrade to version 3 and check TABLE_CHECKSUM is still present + new UpgradeDowngrade(metaClient, cfg, context, SparkUpgradeDowngradeHelper.getInstance()).run(HoodieTableVersion.THREE, null); + assertEquals(HoodieTableVersion.THREE.versionCode(), metaClient.getTableConfig().getTableVersion().versionCode()); + assertTableVersionFromPropertyFile(HoodieTableVersion.THREE); + assertTrue(metaClient.getTableConfig().getProps().containsKey(HoodieTableConfig.TABLE_CHECKSUM.key())); + assertEquals(checksum, metaClient.getTableConfig().getProps().getString(HoodieTableConfig.TABLE_CHECKSUM.key())); + + // remove TABLE_CHECKSUM and upgrade to current version + metaClient.getTableConfig().getProps().remove(HoodieTableConfig.TABLE_CHECKSUM.key()); + new UpgradeDowngrade(metaClient, cfg, context, SparkUpgradeDowngradeHelper.getInstance()).run(HoodieTableVersion.current(), null); + + // verify upgrade and TABLE_CHECKSUM + metaClient = HoodieTableMetaClient.builder().setConf(context.getHadoopConf().get()).setBasePath(cfg.getBasePath()) + .setLayoutVersion(Option.of(new TimelineLayoutVersion(cfg.getTimelineLayoutVersion()))).build(); + assertEquals(HoodieTableVersion.current().versionCode(), metaClient.getTableConfig().getTableVersion().versionCode()); + assertTableVersionFromPropertyFile(HoodieTableVersion.current()); + assertTrue(metaClient.getTableConfig().getProps().containsKey(HoodieTableConfig.TABLE_CHECKSUM.key())); + assertEquals(checksum, metaClient.getTableConfig().getProps().getString(HoodieTableConfig.TABLE_CHECKSUM.key())); + } + private void addNewTableParamsToProps(Map params) { params.put(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), "uuid"); params.put(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "partition_path"); @@ -497,7 +534,7 @@ private Pair, List> twoUpsertCommitDataWithTwoP //just 
generate two partitions dataGen = new HoodieTestDataGenerator(new String[] {DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}); //1. prepare data - HoodieTestDataGenerator.writePartitionMetadata(metaClient.getFs(), new String[] {DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}, basePath); + HoodieTestDataGenerator.writePartitionMetadataDeprecated(metaClient.getFs(), new String[] {DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}, basePath); /** * Write 1 (only inserts) */ diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestBase.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestBase.java index ee3c309b30f28..16fd48af6c014 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestBase.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestBase.java @@ -23,6 +23,7 @@ import org.apache.hudi.common.HoodieCleanStat; import org.apache.hudi.common.fs.ConsistencyGuardConfig; import org.apache.hudi.common.model.EmptyHoodieRecordPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodiePartitionMetadata; @@ -274,7 +275,7 @@ private Function> wrapDeleteKeysGenFunctionForPreppedCa final HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build(); HoodieSparkTable table = HoodieSparkTable.create(writeConfig, context, metaClient); JavaRDD recordsToDelete = jsc.parallelize(records, 1) - .map(key -> new HoodieRecord(key, new EmptyHoodieRecordPayload())); + .map(key -> new HoodieAvroRecord(key, new EmptyHoodieRecordPayload())); JavaRDD taggedRecords = tagLocation(index, recordsToDelete, table); return taggedRecords.map(record -> record.getKey()).collect(); }; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestHarness.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestHarness.java index 906f13d7a63b8..f339f5ed910db 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestHarness.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestHarness.java @@ -59,7 +59,6 @@ import org.apache.hudi.metadata.HoodieBackedTableMetadataWriter; import org.apache.hudi.metadata.HoodieTableMetadata; import org.apache.hudi.metadata.HoodieTableMetadataWriter; -import org.apache.hudi.metadata.MetadataPartitionType; import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter; import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; @@ -680,7 +679,7 @@ private void runFullValidation(HoodieWriteConfig writeConfig, String metadataTab // in the .hoodie folder. 
List metadataTablePartitions = FSUtils.getAllPartitionPaths(engineContext, HoodieTableMetadata.getMetadataTableBasePath(basePath), false, false); - Assertions.assertEquals(MetadataPartitionType.values().length, metadataTablePartitions.size()); + Assertions.assertEquals(metadataWriter.getEnabledPartitionTypes().size(), metadataTablePartitions.size()); // Metadata table should automatically compact and clean // versions are +1 as autoClean / compaction happens end of commits diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java index 6dffd535b9145..05d7f99446e94 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java @@ -207,7 +207,7 @@ public static List getLatestBaseFiles(String basePath, FileSyste } /** - * Reads the paths under the a hoodie table out as a DataFrame. + * Reads the paths under the hoodie table out as a DataFrame. */ public static Dataset read(JavaSparkContext jsc, String basePath, SQLContext sqlContext, FileSystem fs, String... paths) { diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/KeyGeneratorTestUtilities.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/KeyGeneratorTestUtilities.java index 37a58fb3ecfb7..c2256f40c6b98 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/KeyGeneratorTestUtilities.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/KeyGeneratorTestUtilities.java @@ -18,12 +18,10 @@ package org.apache.hudi.testutils; -import org.apache.hudi.AvroConversionHelper; -import org.apache.hudi.AvroConversionUtils; - import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; +import org.apache.hudi.AvroConversionUtils; import org.apache.spark.package$; import org.apache.spark.sql.Row; import org.apache.spark.sql.catalyst.InternalRow; @@ -33,16 +31,15 @@ import org.apache.spark.sql.catalyst.expressions.Attribute; import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema; import org.apache.spark.sql.types.StructType; +import scala.Function1; +import scala.collection.JavaConversions; +import scala.collection.JavaConverters; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.List; import java.util.stream.Collectors; -import scala.Function1; -import scala.collection.JavaConversions; -import scala.collection.JavaConverters; - public class KeyGeneratorTestUtilities { public static final String NESTED_COL_SCHEMA = "{\"type\":\"record\", \"name\":\"nested_col\",\"fields\": [" @@ -51,8 +48,7 @@ public class KeyGeneratorTestUtilities { + "{\"name\": \"timestamp\",\"type\": \"long\"},{\"name\": \"_row_key\", \"type\": \"string\"}," + "{\"name\": \"ts_ms\", \"type\": \"string\"}," + "{\"name\": \"pii_col\", \"type\": \"string\"}," - + "{\"name\": \"nested_col\",\"type\": " - + NESTED_COL_SCHEMA + "}" + + "{\"name\": \"nested_col\",\"type\": [\"null\", " + NESTED_COL_SCHEMA + "]}" + "]}"; public static final String TEST_STRUCTNAME = "test_struct_name"; @@ -86,8 +82,8 @@ public static Row getRow(GenericRecord record) { } public static Row getRow(GenericRecord record, Schema schema, StructType structType) { - Function1 converterFn 
= AvroConversionHelper.createConverterToRow(schema, structType); - Row row = (Row) converterFn.apply(record); + Function1 converterFn = AvroConversionUtils.createConverterToRow(schema, structType); + Row row = converterFn.apply(record); int fieldCount = structType.fieldNames().length; Object[] values = new Object[fieldCount]; for (int i = 0; i < fieldCount; i++) { diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkClientFunctionalTestHarness.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkClientFunctionalTestHarness.java index fb19a63259e19..94e080cae4804 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkClientFunctionalTestHarness.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/SparkClientFunctionalTestHarness.java @@ -214,12 +214,22 @@ protected JavaRDD updateLocation( } protected Stream insertRecordsToMORTable(HoodieTableMetaClient metaClient, List records, - SparkRDDWriteClient client, HoodieWriteConfig cfg, String commitTime) throws IOException { + SparkRDDWriteClient client, HoodieWriteConfig cfg, String commitTime) throws IOException { + return insertRecordsToMORTable(metaClient, records, client, cfg, commitTime, false); + } + + protected Stream insertRecordsToMORTable(HoodieTableMetaClient metaClient, List records, + SparkRDDWriteClient client, HoodieWriteConfig cfg, String commitTime, + boolean doExplicitCommit) throws IOException { HoodieTableMetaClient reloadedMetaClient = HoodieTableMetaClient.reload(metaClient); JavaRDD writeRecords = jsc().parallelize(records, 1); - List statuses = client.insert(writeRecords, commitTime).collect(); + JavaRDD statusesRdd = client.insert(writeRecords, commitTime); + List statuses = statusesRdd.collect(); assertNoWriteErrors(statuses); + if (doExplicitCommit) { + client.commit(commitTime, statusesRdd); + } assertFileSizesEqual(statuses, status -> FSUtils.getFileSize(reloadedMetaClient.getFs(), new Path(reloadedMetaClient.getBasePath(), status.getStat().getPath()))); HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), reloadedMetaClient); @@ -243,6 +253,11 @@ protected Stream insertRecordsToMORTable(HoodieTableMetaClient m } protected void updateRecordsInMORTable(HoodieTableMetaClient metaClient, List records, SparkRDDWriteClient client, HoodieWriteConfig cfg, String commitTime) throws IOException { + updateRecordsInMORTable(metaClient, records, client, cfg, commitTime, true); + } + + protected void updateRecordsInMORTable(HoodieTableMetaClient metaClient, List records, SparkRDDWriteClient client, HoodieWriteConfig cfg, String commitTime, + boolean doExplicitCommit) throws IOException { HoodieTableMetaClient reloadedMetaClient = HoodieTableMetaClient.reload(metaClient); Map recordsMap = new HashMap<>(); @@ -252,9 +267,13 @@ protected void updateRecordsInMORTable(HoodieTableMetaClient metaClient, List statuses = client.upsert(jsc().parallelize(records, 1), commitTime).collect(); + JavaRDD statusesRdd = client.upsert(jsc().parallelize(records, 1), commitTime); + List statuses = statusesRdd.collect(); // Verify there are no errors assertNoWriteErrors(statuses); + if (doExplicitCommit) { + client.commit(commitTime, statusesRdd); + } assertFileSizesEqual(statuses, status -> FSUtils.getFileSize(reloadedMetaClient.getFs(), new Path(reloadedMetaClient.getBasePath(), status.getStat().getPath()))); Option deltaCommit = reloadedMetaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant(); diff 
--git a/hudi-common/pom.xml b/hudi-common/pom.xml index e19070a6f9afe..1a558aeae3326 100644 --- a/hudi-common/pom.xml +++ b/hudi-common/pom.xml @@ -36,61 +36,7 @@ - - - - net.alchim31.maven - scala-maven-plugin - ${scala-maven-plugin.version} - - - -nobootcp - -target:jvm-1.8 - - false - - - - org.apache.maven.plugins - maven-compiler-plugin - - - - - - net.alchim31.maven - scala-maven-plugin - - - scala-compile-first - process-resources - - add-source - compile - - - - scala-test-compile - process-test-resources - - testCompile - - - - - - org.apache.maven.plugins - maven-compiler-plugin - - - compile - - compile - - - - org.apache.maven.plugins maven-jar-plugin @@ -111,10 +57,6 @@ org.apache.rat apache-rat-plugin - - org.scalastyle - scalastyle-maven-plugin - org.jacoco jacoco-maven-plugin @@ -156,13 +98,6 @@ - - - org.scala-lang - scala-library - ${scala.version} - - com.fasterxml.jackson.core diff --git a/hudi-common/src/main/avro/HoodieClusteringGroup.avsc b/hudi-common/src/main/avro/HoodieClusteringGroup.avsc index b2444be84aa00..f2af6b68db873 100644 --- a/hudi-common/src/main/avro/HoodieClusteringGroup.avsc +++ b/hudi-common/src/main/avro/HoodieClusteringGroup.avsc @@ -19,7 +19,6 @@ "namespace":"org.apache.hudi.avro.model", "type":"record", "name":"HoodieClusteringGroup", - "type":"record", "fields":[ { /* Group of files that needs to merged. All the slices in a group will belong to same partition initially. diff --git a/hudi-common/src/main/avro/HoodieMetadata.avsc b/hudi-common/src/main/avro/HoodieMetadata.avsc index bf85587a3a7ac..4037dd0f1ab01 100644 --- a/hudi-common/src/main/avro/HoodieMetadata.avsc +++ b/hudi-common/src/main/avro/HoodieMetadata.avsc @@ -30,27 +30,142 @@ "doc": "Type of the metadata record", "type": "int" }, - { "name": "filesystemMetadata", + { "doc": "Contains information about partitions and files within the dataset", - "type": ["null", { - "type": "map", - "values": { + "name": "filesystemMetadata", + "type": [ + "null", + { + "type": "map", + "values": { + "type": "record", + "name": "HoodieMetadataFileInfo", + "fields": [ + { + "name": "size", + "type": "long", + "doc": "Size of the file" + }, + { + "name": "isDeleted", + "type": "boolean", + "doc": "True if this file has been deleted" + } + ] + } + } + ] + }, + { + "doc": "Metadata Index of bloom filters for all data files in the user table", + "name": "BloomFilterMetadata", + "type": [ + "null", + { + "doc": "Data file bloom filter details", + "name": "HoodieMetadataBloomFilter", "type": "record", - "name": "HoodieMetadataFileInfo", "fields": [ { - "name": "size", - "type": "long", - "doc": "Size of the file" + "doc": "Bloom filter type code", + "name": "type", + "type": "string" + }, + { + "doc": "Instant timestamp when this metadata was created/updated", + "name": "timestamp", + "type": "string" + }, + { + "doc": "Bloom filter binary byte array", + "name": "bloomFilter", + "type": "bytes" + }, + { + "doc": "Bloom filter entry valid/deleted flag", + "name": "isDeleted", + "type": "boolean" + } + ] + } + ], + "default" : null + }, + { + "doc": "Metadata Index of column statistics for all data files in the user table", + "name": "ColumnStatsMetadata", + "type": [ + "null", + { + "doc": "Data file column statistics", + "name": "HoodieMetadataColumnStats", + "type": "record", + "fields": [ + { + "doc": "File name for which this column statistics applies", + "name": "fileName", + "type": [ + "null", + "string" + ] + }, + { + "doc": "Minimum value in the range. 
Based on user data table schema, we can convert this to appropriate type", + "name": "minValue", + "type": [ + "null", + "string" + ] + }, + { + "doc": "Maximum value in the range. Based on user data table schema, we can convert it to appropriate type", + "name": "maxValue", + "type": [ + "null", + "string" + ] + }, + { + "doc": "Total count of values", + "name": "valueCount", + "type": [ + "null", + "long" + ] + }, + { + "doc": "Total count of null values", + "name": "nullCount", + "type": [ + "null", + "long" + ] + }, + { + "doc": "Total storage size on disk", + "name": "totalSize", + "type": [ + "null", + "long" + ] + }, + { + "doc": "Total uncompressed storage size on disk", + "name": "totalUncompressedSize", + "type": [ + "null", + "long" + ] }, { + "doc": "Column range entry valid/deleted flag", "name": "isDeleted", - "type": "boolean", - "doc": "True if this file has been deleted" + "type": "boolean" } ] } - }] + ], + "default" : null } ] } diff --git a/hudi-common/src/main/avro/HoodieRestorePlan.avsc b/hudi-common/src/main/avro/HoodieRestorePlan.avsc new file mode 100644 index 0000000000000..1ad9e6a4b9c80 --- /dev/null +++ b/hudi-common/src/main/avro/HoodieRestorePlan.avsc @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +{ + "namespace":"org.apache.hudi.avro.model", + "type":"record", + "name":"HoodieRestorePlan", + "fields":[ + { + "name": "instantsToRollback", + "default": [], + "type": { + "type": "array", + "default": null, + "items": "HoodieInstantInfo" + } + }, + { + "name":"version", + "type":["int", "null"], + "default": 1 + }] +} \ No newline at end of file diff --git a/hudi-common/src/main/avro/HoodieRollbackMetadata.avsc b/hudi-common/src/main/avro/HoodieRollbackMetadata.avsc index f342db8738d33..5a300cda9e638 100644 --- a/hudi-common/src/main/avro/HoodieRollbackMetadata.avsc +++ b/hudi-common/src/main/avro/HoodieRollbackMetadata.avsc @@ -38,14 +38,6 @@ "type": "long", "doc": "Size of this file in bytes" } - }], "default":null }, - {"name": "writtenLogFiles", "type": ["null", { - "type": "map", - "doc": "Log files written that were expected to be rolledback", - "values": { - "type": "long", - "doc": "Size of this file in bytes" - } }], "default":null } ] }}}, diff --git a/hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java b/hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java new file mode 100644 index 0000000000000..428da925c49ea --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java @@ -0,0 +1,373 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.BaseFile; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieTableQueryType; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.function.Function; +import java.util.stream.Collectors; + +/** + * Common (engine-agnostic) File Index implementation enabling individual query engines to + * list Hudi Table contents based on the + * + *

+ * <ul>
+ *   <li>Table type (MOR, COW)</li>
+ *   <li>Query type (snapshot, read_optimized, incremental)</li>
+ *   <li>Query instant/range</li>
+ * </ul>
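+ *
+ * <p>Engine-specific subclasses only have to supply partition-value parsing. A minimal
+ * sketch (the subclass and its parsing rule are illustrative, not part of this class;
+ * constructor wiring is elided):
+ * <pre>{@code
+ *   class SimpleFileIndex extends BaseHoodieTableFileIndex {
+ *     // treat every slash-separated path segment as a String partition value
+ *     protected Object[] parsePartitionColumnValues(String[] partitionColumns, String partitionPath) {
+ *       return partitionPath.isEmpty() ? new Object[0] : partitionPath.split("/");
+ *     }
+ *   }
+ * }</pre>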
+ */ +public abstract class BaseHoodieTableFileIndex { + + private static final Logger LOG = LogManager.getLogger(BaseHoodieTableFileIndex.class); + + private final String[] partitionColumns; + + private final FileSystemViewStorageConfig fileSystemStorageConfig; + private final HoodieMetadataConfig metadataConfig; + + private final HoodieTableQueryType queryType; + private final Option specifiedQueryInstant; + protected final List queryPaths; + + private final boolean shouldIncludePendingCommits; + private final boolean shouldValidateInstant; + + private final HoodieTableType tableType; + protected final String basePath; + + private final HoodieTableMetaClient metaClient; + private final HoodieEngineContext engineContext; + + private final transient FileStatusCache fileStatusCache; + + protected transient volatile long cachedFileSize = 0L; + protected transient volatile Map> cachedAllInputFileSlices; + + protected volatile boolean queryAsNonePartitionedTable = false; + + private transient volatile HoodieTableFileSystemView fileSystemView = null; + + /** + * @param engineContext Hudi engine-specific context + * @param metaClient Hudi table's meta-client + * @param configProperties unifying configuration (in the form of generic properties) + * @param queryType target query type + * @param queryPaths target DFS paths being queried + * @param specifiedQueryInstant instant as of which table is being queried + * @param shouldIncludePendingCommits flags whether file-index should exclude any pending operations + * @param shouldValidateInstant flags to validate whether query instant is present in the timeline + * @param fileStatusCache transient cache of fetched [[FileStatus]]es + */ + public BaseHoodieTableFileIndex(HoodieEngineContext engineContext, + HoodieTableMetaClient metaClient, + TypedProperties configProperties, + HoodieTableQueryType queryType, + List queryPaths, + Option specifiedQueryInstant, + boolean shouldIncludePendingCommits, + boolean shouldValidateInstant, + FileStatusCache fileStatusCache) { + this.partitionColumns = metaClient.getTableConfig().getPartitionFields() + .orElse(new String[0]); + + this.fileSystemStorageConfig = FileSystemViewStorageConfig.newBuilder() + .fromProperties(configProperties) + .build(); + this.metadataConfig = HoodieMetadataConfig.newBuilder() + .fromProperties(configProperties) + .build(); + + this.queryType = queryType; + this.queryPaths = queryPaths; + this.specifiedQueryInstant = specifiedQueryInstant; + this.shouldIncludePendingCommits = shouldIncludePendingCommits; + this.shouldValidateInstant = shouldValidateInstant; + + this.tableType = metaClient.getTableType(); + this.basePath = metaClient.getBasePath(); + + this.metaClient = metaClient; + this.engineContext = engineContext; + this.fileStatusCache = fileStatusCache; + + doRefresh(); + } + + protected abstract Object[] parsePartitionColumnValues(String[] partitionColumns, String partitionPath); + + /** + * Returns latest completed instant as seen by this instance of the file-index + */ + public Option getLatestCompletedInstant() { + return getActiveTimeline().filterCompletedInstants().lastInstant(); + } + + /** + * Returns table's base-path + */ + public String getBasePath() { + return metaClient.getBasePath(); + } + + /** + * Fetch list of latest base files and log files per partition. 
+ * + * @return mapping from string partition paths to its base/log files + */ + public Map> listFileSlices() { + return cachedAllInputFileSlices.entrySet() + .stream() + .collect(Collectors.toMap(e -> e.getKey().path, Map.Entry::getValue)); + } + + protected List getAllQueryPartitionPaths() { + List queryRelativePartitionPaths = queryPaths.stream() + .map(path -> FSUtils.getRelativePartitionPath(new Path(basePath), path)) + .collect(Collectors.toList()); + + // Load all the partition path from the basePath, and filter by the query partition path. + // TODO load files from the queryRelativePartitionPaths directly. + List matchedPartitionPaths = FSUtils.getAllPartitionPaths(engineContext, metadataConfig, basePath) + .stream() + .filter(path -> queryRelativePartitionPaths.stream().anyMatch(path::startsWith)) + .collect(Collectors.toList()); + + // Convert partition's path into partition descriptor + return matchedPartitionPaths.stream() + .map(partitionPath -> { + Object[] partitionColumnValues = parsePartitionColumnValues(partitionColumns, partitionPath); + return new PartitionPath(partitionPath, partitionColumnValues); + }) + .collect(Collectors.toList()); + } + + protected void refresh() { + fileStatusCache.invalidate(); + doRefresh(); + } + + protected HoodieTimeline getActiveTimeline() { + // NOTE: We have to use commits and compactions timeline, to make sure that we're properly + // handling the following case: when records are inserted into the new log-file w/in the file-group + // that is under the pending compaction process, new log-file will bear the compaction's instant (on the + // timeline) in its name, as opposed to the base-file's commit instant. To make sure we're not filtering + // such log-file we have to _always_ include pending compaction instants into consideration + // TODO(HUDI-3302) re-evaluate whether we should not filter any commits in here + HoodieTimeline timeline = metaClient.getCommitsAndCompactionTimeline(); + if (shouldIncludePendingCommits) { + return timeline; + } else { + return timeline.filterCompletedAndCompactionInstants(); + } + } + + /** + * Load all partition paths and it's files under the query table path. 
+ */ + private Map loadPartitionPathFiles() { + // List files in all partition paths + List pathToFetch = new ArrayList<>(); + Map cachedPartitionToFiles = new HashMap<>(); + + // Fetch from the FileStatusCache + List partitionPaths = getAllQueryPartitionPaths(); + partitionPaths.forEach(partitionPath -> { + Option filesInPartition = fileStatusCache.get(partitionPath.fullPartitionPath(basePath)); + if (filesInPartition.isPresent()) { + cachedPartitionToFiles.put(partitionPath, filesInPartition.get()); + } else { + pathToFetch.add(partitionPath); + } + }); + + Map fetchedPartitionToFiles; + + if (pathToFetch.isEmpty()) { + fetchedPartitionToFiles = Collections.emptyMap(); + } else { + Map fullPartitionPathsMapToFetch = pathToFetch.stream() + .collect(Collectors.toMap( + partitionPath -> partitionPath.fullPartitionPath(basePath).toString(), + Function.identity()) + ); + + fetchedPartitionToFiles = + FSUtils.getFilesInPartitions( + engineContext, + metadataConfig, + basePath, + fullPartitionPathsMapToFetch.keySet().toArray(new String[0]), + fileSystemStorageConfig.getSpillableDir()) + .entrySet() + .stream() + .collect(Collectors.toMap(e -> fullPartitionPathsMapToFetch.get(e.getKey()), e -> e.getValue())); + + } + + // Update the fileStatusCache + fetchedPartitionToFiles.forEach((partitionPath, filesInPartition) -> { + fileStatusCache.put(partitionPath.fullPartitionPath(basePath), filesInPartition); + }); + + return CollectionUtils.combine(cachedPartitionToFiles, fetchedPartitionToFiles); + } + + private void doRefresh() { + long startTime = System.currentTimeMillis(); + + Map partitionFiles = loadPartitionPathFiles(); + FileStatus[] allFiles = partitionFiles.values().stream().flatMap(Arrays::stream).toArray(FileStatus[]::new); + + metaClient.reloadActiveTimeline(); + + HoodieTimeline activeTimeline = getActiveTimeline(); + Option latestInstant = activeTimeline.lastInstant(); + + // TODO we can optimize the flow by: + // - First fetch list of files from instants of interest + // - Load FileStatus's + fileSystemView = new HoodieTableFileSystemView(metaClient, activeTimeline, allFiles); + + Option queryInstant = + specifiedQueryInstant.or(() -> latestInstant.map(HoodieInstant::getTimestamp)); + + validate(activeTimeline, queryInstant); + + if (tableType.equals(HoodieTableType.MERGE_ON_READ) && queryType.equals(HoodieTableQueryType.SNAPSHOT)) { + cachedAllInputFileSlices = partitionFiles.keySet().stream() + .collect(Collectors.toMap( + Function.identity(), + partitionPath -> + queryInstant.map(instant -> + fileSystemView.getLatestMergedFileSlicesBeforeOrOn(partitionPath.path, queryInstant.get()) + .collect(Collectors.toList()) + ) + .orElse(Collections.emptyList()) + ) + ); + } else { + cachedAllInputFileSlices = partitionFiles.keySet().stream() + .collect(Collectors.toMap( + Function.identity(), + partitionPath -> + queryInstant.map(instant -> + fileSystemView.getLatestFileSlicesBeforeOrOn(partitionPath.path, instant, true) + ) + .orElse(fileSystemView.getLatestFileSlices(partitionPath.path)) + .collect(Collectors.toList()) + ) + ); + } + + cachedFileSize = cachedAllInputFileSlices.values().stream() + .flatMap(Collection::stream) + .mapToLong(BaseHoodieTableFileIndex::fileSliceSize) + .sum(); + + // If the partition value contains InternalRow.empty, we query it as a non-partitioned table. 
+ queryAsNonePartitionedTable = partitionFiles.keySet().stream().anyMatch(p -> p.values.length == 0); + + long duration = System.currentTimeMillis() - startTime; + + LOG.info(String.format("Refresh table %s, spent: %d ms", metaClient.getTableConfig().getTableName(), duration)); + } + + private void validate(HoodieTimeline activeTimeline, Option queryInstant) { + if (shouldValidateInstant) { + if (queryInstant.isPresent() && !activeTimeline.containsInstant(queryInstant.get())) { + throw new HoodieIOException(String.format("Query instant (%s) not found in the timeline", queryInstant.get())); + } + } + } + + private static long fileSliceSize(FileSlice fileSlice) { + long logFileSize = fileSlice.getLogFiles().map(HoodieLogFile::getFileSize) + .filter(s -> s > 0) + .reduce(0L, Long::sum); + + return fileSlice.getBaseFile().map(BaseFile::getFileLen).orElse(0L) + logFileSize; + } + + protected static final class PartitionPath { + final String path; + final Object[] values; + + public PartitionPath(String path, Object[] values) { + this.path = path; + this.values = values; + } + + Path fullPartitionPath(String basePath) { + if (!path.isEmpty()) { + return new Path(basePath, path); + } + + return new Path(basePath); + } + + @Override + public boolean equals(Object other) { + return other instanceof PartitionPath + && Objects.equals(path, ((PartitionPath) other).path) + && Arrays.equals(values, ((PartitionPath) other).values); + } + + @Override + public int hashCode() { + return path.hashCode() * 1103 + Arrays.hashCode(values); + } + } + + protected interface FileStatusCache { + Option get(Path path); + + void put(Path path, FileStatus[] leafFiles); + + void invalidate(); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/TypeUtils.java b/hudi-common/src/main/java/org/apache/hudi/TypeUtils.java new file mode 100644 index 0000000000000..6e7d2c87459b5 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/TypeUtils.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi; + +public class TypeUtils { + + /** + * This utility abstracts unsafe type-casting in a way that allows to + *
+ * <ul>
+ *   <li>Search for such type-casts more easily (just searching for usages of this method)</li>
+ *   <li>Avoid type-cast warnings from the compiler</li>
+ * </ul>
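+ *
+ * <p>Minimal usage sketch (the surrounding variables are illustrative):
+ * <pre>{@code
+ *   Object raw = deserialize(bytes);        // statically typed as Object
+ *   List<String> names = unsafeCast(raw);   // single searchable cast, no unchecked warning
+ * }</pre>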
+ */ + @SuppressWarnings("unchecked") + public static T unsafeCast(Object o) { + return (T) o; + } + +} diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java index 9fabc647d7773..209721e24a8d9 100644 --- a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java @@ -18,17 +18,7 @@ package org.apache.hudi.avro; -import org.apache.hudi.common.config.SerializableSchema; -import org.apache.hudi.common.model.HoodieOperation; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.StringUtils; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.exception.SchemaCompatibilityException; - +import org.apache.avro.AvroRuntimeException; import org.apache.avro.Conversions.DecimalConversion; import org.apache.avro.JsonProperties; import org.apache.avro.LogicalTypes; @@ -50,15 +40,22 @@ import org.apache.avro.io.JsonDecoder; import org.apache.avro.io.JsonEncoder; import org.apache.avro.specific.SpecificRecordBase; +import org.apache.hudi.common.config.SerializableSchema; +import org.apache.hudi.common.model.HoodieOperation; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.SchemaCompatibilityException; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; import java.math.BigDecimal; import java.nio.ByteBuffer; -import java.nio.charset.StandardCharsets; import java.sql.Timestamp; import java.time.LocalDate; import java.util.ArrayList; @@ -67,8 +64,6 @@ import java.util.List; import java.util.Map; import java.util.stream.Collectors; -import java.util.zip.DeflaterOutputStream; -import java.util.zip.InflaterInputStream; /** * Helper class to do common stuff across Avro. @@ -283,7 +278,7 @@ public static Schema getSchemaForFields(Schema fileSchema, List fields) for (Schema.Field schemaField: fileSchema.getFields()) { if (fields.contains(schemaField.name())) { - toBeAddedFields.add(new Schema.Field(schemaField.name(), schemaField.schema(), schemaField.doc(), schemaField.defaultValue())); + toBeAddedFields.add(new Schema.Field(schemaField.name(), schemaField.schema(), schemaField.doc(), schemaField.defaultVal())); } } recordSchema.setFields(toBeAddedFields); @@ -343,16 +338,26 @@ public static GenericRecord stitchRecords(GenericRecord left, GenericRecord righ } /** - * Given a avro record with a given schema, rewrites it into the new schema while setting fields only from the new + * Given an Avro record with a given schema, rewrites it into the new schema while setting fields only from the new * schema. + * + * NOTE: This method is rewriting every record's field that is record itself recursively. 
It's + * caller's responsibility to make sure that no unnecessary re-writing occurs (by preemptively + * checking whether the record does require re-writing to adhere to the new schema) + * * NOTE: Here, the assumption is that you cannot go from an evolved schema (schema with (N) fields) - * to an older schema (schema with (N-1) fields). All fields present in the older record schema MUST be present in the - * new schema and the default/existing values are carried over. - * This particular method does the following things : - * a) Create a new empty GenericRecord with the new schema. - * b) For GenericRecord, copy over the data from the old schema to the new schema or set default values for all fields of this - * transformed schema - * c) For SpecificRecord, hoodie_metadata_fields have a special treatment. This is done because for code generated + * to an older schema (schema with (N-1) fields). All fields present in the older record schema MUST be present in the + * new schema and the default/existing values are carried over. + * + * This particular method does the following: + *
+ *   1. Create a new empty GenericRecord with the new schema.
+ *   2. For GenericRecord, copy over the data from the old schema to the new schema or set default values for all fields of this transformed schema
+ *   3. For SpecificRecord, hoodie_metadata_fields have a special treatment (see below, and the sketch after this list)
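+ *
+ * A minimal usage sketch (editorial illustration, not part of the original patch;
+ * oldRecord and evolvedSchema are hypothetical):
+ *   GenericRecord rewritten = HoodieAvroUtils.rewriteRecord(oldRecord, evolvedSchema);
+ *   // nested record fields are rewritten recursively against their resolved non-null schema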
+ * + * For SpecificRecord we ignore Hudi Metadata fields, because for code generated * avro classes (HoodieMetadataRecord), the avro record is a SpecificBaseRecord type instead of a GenericRecord. * SpecificBaseRecord throws null pointer exception for record.get(name) if name is not present in the schema of the * record (which happens when converting a SpecificBaseRecord without hoodie_metadata_fields to a new record with it). @@ -364,58 +369,43 @@ public static GenericRecord rewriteRecord(GenericRecord oldRecord, Schema newSch GenericRecord newRecord = new GenericData.Record(newSchema); boolean isSpecificRecord = oldRecord instanceof SpecificRecordBase; for (Schema.Field f : newSchema.getFields()) { - if (!isSpecificRecord) { - copyOldValueOrSetDefault(oldRecord, newRecord, f); - } else if (!isMetadataField(f.name())) { + if (!(isSpecificRecord && isMetadataField(f.name()))) { copyOldValueOrSetDefault(oldRecord, newRecord, f); } } + if (!GenericData.get().validate(newSchema, newRecord)) { throw new SchemaCompatibilityException( "Unable to validate the rewritten record " + oldRecord + " against schema " + newSchema); } - return newRecord; - } - private static void copyOldValueOrSetDefault(GenericRecord oldRecord, GenericRecord newRecord, Schema.Field f) { - // cache the result of oldRecord.get() to save CPU expensive hash lookup - Schema oldSchema = oldRecord.getSchema(); - Object fieldValue = oldSchema.getField(f.name()) == null ? null : oldRecord.get(f.name()); - if (fieldValue == null) { - if (f.defaultVal() instanceof JsonProperties.Null) { - newRecord.put(f.name(), null); - } else { - newRecord.put(f.name(), f.defaultVal()); - } - } else { - newRecord.put(f.name(), fieldValue); - } + return newRecord; } - public static byte[] compress(String text) { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - try { - OutputStream out = new DeflaterOutputStream(baos); - out.write(text.getBytes(StandardCharsets.UTF_8)); - out.close(); - } catch (IOException e) { - throw new HoodieIOException("IOException while compressing text " + text, e); - } - return baos.toByteArray(); + /** + * Converts list of {@link GenericRecord} provided into the {@link GenericRecord} adhering to the + * provided {@code newSchema}. + * + * To better understand conversion rules please check {@link #rewriteRecord(GenericRecord, Schema)} + */ + public static List rewriteRecords(List records, Schema newSchema) { + return records.stream().map(r -> rewriteRecord(r, newSchema)).collect(Collectors.toList()); } - public static String decompress(byte[] bytes) { - InputStream in = new InflaterInputStream(new ByteArrayInputStream(bytes)); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - try { - byte[] buffer = new byte[8192]; - int len; - while ((len = in.read(buffer)) > 0) { - baos.write(buffer, 0, len); - } - return new String(baos.toByteArray(), StandardCharsets.UTF_8); - } catch (IOException e) { - throw new HoodieIOException("IOException while decompressing text", e); + private static void copyOldValueOrSetDefault(GenericRecord oldRecord, GenericRecord newRecord, Schema.Field field) { + Schema oldSchema = oldRecord.getSchema(); + Object fieldValue = oldSchema.getField(field.name()) == null ? null : oldRecord.get(field.name()); + + if (fieldValue != null) { + // In case field's value is a nested record, we have to rewrite it as well + Object newFieldValue = fieldValue instanceof GenericRecord + ? 
rewriteRecord((GenericRecord) fieldValue, resolveNullableSchema(field.schema())) + : fieldValue; + newRecord.put(field.name(), newFieldValue); + } else if (field.defaultVal() instanceof JsonProperties.Null) { + newRecord.put(field.name(), null); + } else { + newRecord.put(field.name(), field.defaultVal()); } } @@ -457,23 +447,32 @@ public static Object getNestedFieldVal(GenericRecord record, String fieldName, b String[] parts = fieldName.split("\\."); GenericRecord valueNode = record; int i = 0; - for (; i < parts.length; i++) { - String part = parts[i]; - Object val = valueNode.get(part); - if (val == null) { - break; - } + try { + for (; i < parts.length; i++) { + String part = parts[i]; + Object val = valueNode.get(part); + if (val == null) { + break; + } - // return, if last part of name - if (i == parts.length - 1) { - Schema fieldSchema = valueNode.getSchema().getField(part).schema(); - return convertValueForSpecificDataTypes(fieldSchema, val, consistentLogicalTimestampEnabled); - } else { - // VC: Need a test here - if (!(val instanceof GenericRecord)) { - throw new HoodieException("Cannot find a record at part value :" + part); + // return, if last part of name + if (i == parts.length - 1) { + Schema fieldSchema = valueNode.getSchema().getField(part).schema(); + return convertValueForSpecificDataTypes(fieldSchema, val, consistentLogicalTimestampEnabled); + } else { + // VC: Need a test here + if (!(val instanceof GenericRecord)) { + throw new HoodieException("Cannot find a record at part value :" + part); + } + valueNode = (GenericRecord) val; } - valueNode = (GenericRecord) val; + } + } catch (AvroRuntimeException e) { + // Since Avro 1.10, Avro will throw AvroRuntimeException("Not a valid schema field: " + key) + // rather than return null like the previous version if the record doesn't contain this key. + // So when returnNullIfNotFound is true, catch this exception.
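+ // Editorial illustration of the intended contract (assumed, not part of the original patch):
+ //   getNestedFieldVal(rec, "a.b", true, false)  -> null when "b" is absent from the schema
+ //   getNestedFieldVal(rec, "a.b", false, false) -> rethrows the AvroRuntimeException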
+ if (!returnNullIfNotFound) { + throw e; } } @@ -622,4 +621,24 @@ public static Object getRecordColumnValues(HoodieRecord innerTypes = schema.getTypes(); + Schema nonNullType = + innerTypes.stream() + .filter(it -> it.getType() != Schema.Type.NULL) + .findFirst() + .orElse(null); + + if (innerTypes.size() != 2 || nonNullType == null) { + throw new AvroRuntimeException( + String.format("Unsupported Avro UNION type %s: Only UNION of a null type and a non-null type is supported", schema)); + } + + return nonNullType; + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroWriteSupport.java b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroWriteSupport.java index 3207cfccd80c2..18827c66bf096 100644 --- a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroWriteSupport.java +++ b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroWriteSupport.java @@ -18,11 +18,10 @@ package org.apache.hudi.avro; +import org.apache.avro.Schema; import org.apache.hudi.common.bloom.BloomFilter; import org.apache.hudi.common.bloom.HoodieDynamicBoundedBloomFilter; import org.apache.hudi.common.util.Option; - -import org.apache.avro.Schema; import org.apache.parquet.avro.AvroWriteSupport; import org.apache.parquet.hadoop.api.WriteSupport; import org.apache.parquet.schema.MessageType; diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/MercifulJsonConverter.java b/hudi-common/src/main/java/org/apache/hudi/avro/MercifulJsonConverter.java index d759a8debf602..15335193414ae 100644 --- a/hudi-common/src/main/java/org/apache/hudi/avro/MercifulJsonConverter.java +++ b/hudi-common/src/main/java/org/apache/hudi/avro/MercifulJsonConverter.java @@ -18,15 +18,14 @@ package org.apache.hudi.avro; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.exception.HoodieIOException; - import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.avro.Schema; import org.apache.avro.Schema.Type; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; import java.io.IOException; import java.io.Serializable; @@ -293,7 +292,7 @@ public Pair convert(Object value, String name, Schema schema) { for (Object v : (List) value) { listRes.add(convertJsonToAvroField(v, name, elementSchema)); } - return Pair.of(true, listRes); + return Pair.of(true, new GenericData.Array<>(schema, listRes)); } }; } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/HoodieRollbackStat.java b/hudi-common/src/main/java/org/apache/hudi/common/HoodieRollbackStat.java index 3e4ee34319c7c..a3191fa026c84 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/HoodieRollbackStat.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/HoodieRollbackStat.java @@ -38,16 +38,13 @@ public class HoodieRollbackStat implements Serializable { private final List failedDeleteFiles; // Count of HoodieLogFile to commandBlocks written for a particular rollback private final Map commandBlocksCount; - // all log files with same base instant as instant to be rolledback - private final Map writtenLogFileSizeMap; public HoodieRollbackStat(String partitionPath, List successDeleteFiles, List failedDeleteFiles, - Map commandBlocksCount, Map writtenLogFileSizeMap) { + Map commandBlocksCount) { this.partitionPath = partitionPath; this.successDeleteFiles = 
successDeleteFiles; this.failedDeleteFiles = failedDeleteFiles; this.commandBlocksCount = commandBlocksCount; - this.writtenLogFileSizeMap = writtenLogFileSizeMap; } public Map getCommandBlocksCount() { @@ -66,10 +63,6 @@ public List getFailedDeleteFiles() { return failedDeleteFiles; } - public Map getWrittenLogFileSizeMap() { - return writtenLogFileSizeMap; - } - public static HoodieRollbackStat.Builder newBuilder() { return new Builder(); } @@ -82,7 +75,6 @@ public static class Builder { private List successDeleteFiles; private List failedDeleteFiles; private Map commandBlocksCount; - private Map writtenLogFileSizeMap; private String partitionPath; public Builder withDeletedFileResults(Map deletedFiles) { @@ -108,11 +100,6 @@ public Builder withRollbackBlockAppendResults(Map commandBlock return this; } - public Builder withWrittenLogFileSizeMap(Map writtenLogFileSizeMap) { - this.writtenLogFileSizeMap = writtenLogFileSizeMap; - return this; - } - public Builder withPartitionPath(String partitionPath) { this.partitionPath = partitionPath; return this; @@ -128,10 +115,7 @@ public HoodieRollbackStat build() { if (commandBlocksCount == null) { commandBlocksCount = Collections.EMPTY_MAP; } - if (writtenLogFileSizeMap == null) { - writtenLogFileSizeMap = Collections.EMPTY_MAP; - } - return new HoodieRollbackStat(partitionPath, successDeleteFiles, failedDeleteFiles, commandBlocksCount, writtenLogFileSizeMap); + return new HoodieRollbackStat(partitionPath, successDeleteFiles, failedDeleteFiles, commandBlocksCount); } } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/bloom/HoodieDynamicBoundedBloomFilter.java b/hudi-common/src/main/java/org/apache/hudi/common/bloom/HoodieDynamicBoundedBloomFilter.java index 343822b13adec..d4bc287c551c2 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/bloom/HoodieDynamicBoundedBloomFilter.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/bloom/HoodieDynamicBoundedBloomFilter.java @@ -63,7 +63,7 @@ public class HoodieDynamicBoundedBloomFilter implements BloomFilter { * @param serString the serialized string which represents the {@link HoodieDynamicBoundedBloomFilter} * @param typeCode type code of the bloom filter */ - HoodieDynamicBoundedBloomFilter(String serString, BloomFilterTypeCode typeCode) { + public HoodieDynamicBoundedBloomFilter(String serString, BloomFilterTypeCode typeCode) { // ignoring the type code for now, since we have just one version byte[] bytes = Base64CodecUtil.decode(serString); DataInputStream dis = new DataInputStream(new ByteArrayInputStream(bytes)); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetadataConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetadataConfig.java index 51791c945d589..86ff64177b73a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetadataConfig.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetadataConfig.java @@ -83,20 +83,24 @@ public final class HoodieMetadataConfig extends HoodieConfig { .key(METADATA_PREFIX + ".keep.min.commits") .defaultValue(20) .sinceVersion("0.7.0") - .withDocumentation("Controls the archival of the metadata table’s timeline."); + .withDocumentation("Archiving service moves older entries from metadata table’s timeline " + + "into an archived log after each write, to keep the overhead constant, even as the " + + "metadata table size grows. 
This config controls the minimum number of instants " + + "to retain in the active timeline."); public static final ConfigProperty MAX_COMMITS_TO_KEEP = ConfigProperty .key(METADATA_PREFIX + ".keep.max.commits") .defaultValue(30) .sinceVersion("0.7.0") - .withDocumentation("Controls the archival of the metadata table’s timeline."); + .withDocumentation("Similar to " + MIN_COMMITS_TO_KEEP.key() + ", this config controls " + + "the maximum number of instants to retain in the active timeline."); // Cleaner commits retained public static final ConfigProperty CLEANER_COMMITS_RETAINED = ConfigProperty .key(METADATA_PREFIX + ".cleaner.commits.retained") .defaultValue(3) .sinceVersion("0.7.0") - .withDocumentation("Controls retention/history for metadata table."); + .withDocumentation("Number of commits to retain, without cleaning, on metadata table."); // Regex to filter out matching directories during bootstrap public static final ConfigProperty DIR_FILTER_REGEX = ConfigProperty @@ -122,20 +126,62 @@ public final class HoodieMetadataConfig extends HoodieConfig { .key(METADATA_PREFIX + ".enable.full.scan.log.files") .defaultValue(true) .sinceVersion("0.10.0") - .withDocumentation("Enable full scanning of log files while reading log records. If disabled, hudi does look up of only interested entries."); + .withDocumentation("Enable full scanning of log files while reading log records. If disabled, Hudi does look up of only interested entries."); + + public static final ConfigProperty ENABLE_METADATA_INDEX_BLOOM_FILTER = ConfigProperty + .key(METADATA_PREFIX + ".index.bloom.filter.enable") + .defaultValue(false) + .sinceVersion("0.11.0") + .withDocumentation("Enable indexing bloom filters of user data files under metadata table. When enabled, " + + "metadata table will have a partition to store the bloom filter index and will be " + + "used during the index lookups."); + + public static final ConfigProperty METADATA_INDEX_BLOOM_FILTER_FILE_GROUP_COUNT = ConfigProperty + .key(METADATA_PREFIX + ".index.bloom.filter.file.group.count") + .defaultValue(4) + .sinceVersion("0.11.0") + .withDocumentation("Metadata bloom filter index partition file group count. This controls the size of the base and " + + "log files and read parallelism in the bloom filter index partition. The recommendation is to size the " + + "file group count such that the base files are under 1GB."); + + public static final ConfigProperty ENABLE_METADATA_INDEX_COLUMN_STATS = ConfigProperty + .key(METADATA_PREFIX + ".index.column.stats.enable") + .defaultValue(false) + .sinceVersion("0.11.0") + .withDocumentation("Enable indexing column ranges of user data files under metadata table key lookups. When " + + "enabled, metadata table will have a partition to store the column ranges and will be " + + "used for pruning files during the index lookups."); + + public static final ConfigProperty METADATA_INDEX_COLUMN_STATS_FILE_GROUP_COUNT = ConfigProperty + .key(METADATA_PREFIX + ".index.column.stats.file.group.count") + .defaultValue(2) + .sinceVersion("0.11.0") + .withDocumentation("Metadata column stats partition file group count. This controls the size of the base and " + + "log files and read parallelism in the column stats index partition. 
The recommendation is to size the " + + "file group count such that the base files are under 1GB."); + + public static final ConfigProperty ENABLE_METADATA_INDEX_COLUMN_STATS_FOR_ALL_COLUMNS = ConfigProperty + .key(METADATA_PREFIX + ".index.column.stats.all_columns.enable") + .defaultValue(true) + .sinceVersion("0.11.0") + .withDocumentation("Enable indexing column ranges of user data files for all columns under " + + "metadata table key lookups. When enabled, metadata table will have a partition to " + + "store the column ranges and will be used for pruning files during the index lookups. " + + "Only applies if " + ENABLE_METADATA_INDEX_COLUMN_STATS.key() + " is enabled."); public static final ConfigProperty POPULATE_META_FIELDS = ConfigProperty .key(METADATA_PREFIX + ".populate.meta.fields") - .defaultValue(true) + .defaultValue(false) .sinceVersion("0.10.0") .withDocumentation("When enabled, populates all meta fields. When disabled, no meta fields are populated."); public static final ConfigProperty IGNORE_SPURIOUS_DELETES = ConfigProperty .key("_" + METADATA_PREFIX + ".ignore.spurious.deletes") .defaultValue(true) - .sinceVersion("0.10.10") - .withDocumentation("There are cases when extra files are requested to be deleted from metadata table which was never added before. This config" - + "determines how to handle such spurious deletes"); + .sinceVersion("0.10.0") + .withDocumentation("There are cases when extra files are requested to be deleted from " + + "metadata table which are never added before. This config determines how to handle " + + "such spurious deletes"); private HoodieMetadataConfig() { super(); @@ -157,6 +203,26 @@ public boolean enabled() { return getBoolean(ENABLE); } + public boolean isBloomFilterIndexEnabled() { + return getBooleanOrDefault(ENABLE_METADATA_INDEX_BLOOM_FILTER); + } + + public boolean isColumnStatsIndexEnabled() { + return getBooleanOrDefault(ENABLE_METADATA_INDEX_COLUMN_STATS); + } + + public boolean isMetadataColumnStatsIndexForAllColumnsEnabled() { + return getBooleanOrDefault(ENABLE_METADATA_INDEX_COLUMN_STATS_FOR_ALL_COLUMNS); + } + + public int getBloomFilterIndexFileGroupCount() { + return getIntOrDefault(METADATA_INDEX_BLOOM_FILTER_FILE_GROUP_COUNT); + } + + public int getColumnStatsIndexFileGroupCount() { + return getIntOrDefault(METADATA_INDEX_COLUMN_STATS_FILE_GROUP_COUNT); + } + public boolean enableMetrics() { return getBoolean(METRICS_ENABLE); } @@ -199,6 +265,31 @@ public Builder enable(boolean enable) { return this; } + public Builder withMetadataIndexBloomFilter(boolean enable) { + metadataConfig.setValue(ENABLE_METADATA_INDEX_BLOOM_FILTER, String.valueOf(enable)); + return this; + } + + public Builder withMetadataIndexBloomFilterFileGroups(int fileGroupCount) { + metadataConfig.setValue(METADATA_INDEX_BLOOM_FILTER_FILE_GROUP_COUNT, String.valueOf(fileGroupCount)); + return this; + } + + public Builder withMetadataIndexColumnStats(boolean enable) { + metadataConfig.setValue(ENABLE_METADATA_INDEX_COLUMN_STATS, String.valueOf(enable)); + return this; + } + + public Builder withMetadataIndexColumnStatsFileGroupCount(int fileGroupCount) { + metadataConfig.setValue(METADATA_INDEX_COLUMN_STATS_FILE_GROUP_COUNT, String.valueOf(fileGroupCount)); + return this; + } + + public Builder withMetadataIndexForAllColumns(boolean enable) { + metadataConfig.setValue(ENABLE_METADATA_INDEX_COLUMN_STATS_FOR_ALL_COLUMNS, String.valueOf(enable)); + return this; + } + public Builder enableMetrics(boolean enableMetrics) { 
metadataConfig.setValue(METRICS_ENABLE, String.valueOf(enableMetrics)); return this; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/TypedProperties.java b/hudi-common/src/main/java/org/apache/hudi/common/config/TypedProperties.java index 49db9b23ae706..6639e88d56f3f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/config/TypedProperties.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/config/TypedProperties.java @@ -20,7 +20,12 @@ import java.io.Serializable; import java.util.Arrays; +import java.util.Collections; +import java.util.Enumeration; +import java.util.HashSet; +import java.util.LinkedHashSet; import java.util.List; +import java.util.Map; import java.util.Objects; import java.util.Properties; import java.util.Set; @@ -31,6 +36,8 @@ */ public class TypedProperties extends Properties implements Serializable { + private final HashSet keys = new LinkedHashSet<>(); + public TypedProperties() { super(null); } @@ -43,15 +50,60 @@ public TypedProperties(Properties defaults) { } } - private void checkKey(String property) { - if (!keyExists(property)) { - throw new IllegalArgumentException("Property " + property + " not found"); + @Override + public Enumeration propertyNames() { + return Collections.enumeration(keys); + } + + @Override + public synchronized Enumeration keys() { + return Collections.enumeration(keys); + } + + @Override + public Set stringPropertyNames() { + Set set = new LinkedHashSet<>(); + for (Object key : this.keys) { + if (key instanceof String) { + set.add((String) key); + } + } + return set; + } + + public synchronized void putAll(Properties t) { + for (Map.Entry e : t.entrySet()) { + if (!containsKey(String.valueOf(e.getKey()))) { + keys.add(e.getKey()); + } + super.put(e.getKey(), e.getValue()); + } + } + + @Override + public synchronized Object put(Object key, Object value) { + keys.remove(key); + keys.add(key); + return super.put(key, value); + } + + public synchronized Object putIfAbsent(Object key, Object value) { + if (!containsKey(String.valueOf(key))) { + keys.add(key); } + return super.putIfAbsent(key, value); } - private boolean keyExists(String property) { - Set keys = super.stringPropertyNames(); - return keys.contains(property); + @Override + public Object remove(Object key) { + keys.remove(key); + return super.remove(key); + } + + private void checkKey(String property) { + if (!containsKey(property)) { + throw new IllegalArgumentException("Property " + property + " not found"); + } } public String getString(String property) { @@ -60,11 +112,11 @@ public String getString(String property) { } public String getString(String property, String defaultValue) { - return keyExists(property) ? getProperty(property) : defaultValue; + return containsKey(property) ? getProperty(property) : defaultValue; } public List getStringList(String property, String delimiter, List defaultVal) { - if (!keyExists(property)) { + if (!containsKey(property)) { return defaultVal; } return Arrays.stream(getProperty(property).split(delimiter)).map(String::trim).collect(Collectors.toList()); @@ -76,7 +128,7 @@ public int getInteger(String property) { } public int getInteger(String property, int defaultValue) { - return keyExists(property) ? Integer.parseInt(getProperty(property)) : defaultValue; + return containsKey(property) ? 
Integer.parseInt(getProperty(property)) : defaultValue; } public long getLong(String property) { @@ -85,7 +137,7 @@ public long getLong(String property) { } public long getLong(String property, long defaultValue) { - return keyExists(property) ? Long.parseLong(getProperty(property)) : defaultValue; + return containsKey(property) ? Long.parseLong(getProperty(property)) : defaultValue; } public boolean getBoolean(String property) { @@ -94,7 +146,7 @@ public boolean getBoolean(String property) { } public boolean getBoolean(String property, boolean defaultValue) { - return keyExists(property) ? Boolean.parseBoolean(getProperty(property)) : defaultValue; + return containsKey(property) ? Boolean.parseBoolean(getProperty(property)) : defaultValue; } public double getDouble(String property) { @@ -103,6 +155,6 @@ public double getDouble(String property) { } public double getDouble(String property, double defaultValue) { - return keyExists(property) ? Double.parseDouble(getProperty(property)) : defaultValue; + return containsKey(property) ? Double.parseDouble(getProperty(property)) : defaultValue; } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java index d7af8a7d46d8b..7c9b7cc806fa4 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java @@ -135,6 +135,17 @@ public static Path addSchemeIfLocalPath(String path) { return providedPath; } + /** + * Makes path qualified w/ {@link FileSystem}'s URI + * + * @param fs instance of {@link FileSystem} path belongs to + * @param path path to be qualified + * @return qualified path, prefixed w/ the URI of the target FS object provided + */ + public static Path makeQualified(FileSystem fs, Path path) { + return path.makeQualified(fs.getUri(), fs.getWorkingDirectory()); + } + /** * A write token uniquely identifies an attempt at one of the IOHandle operations (Merge/Create/Append). */ @@ -484,24 +495,25 @@ public static FileStatus[] getAllDataFilesInPartition(FileSystem fs, Path partit } /** - * Get the latest log file written from the list of log files passed in. + * Get the latest log file for the passed in file-id in the partition path */ - public static Option getLatestLogFile(Stream logFiles) { - return Option.fromJavaOptional(logFiles.min(HoodieLogFile.getReverseLogFileComparator())); + public static Option getLatestLogFile(FileSystem fs, Path partitionPath, String fileId, + String logFileExtension, String baseCommitTime) throws IOException { + return getLatestLogFile(getAllLogFiles(fs, partitionPath, fileId, logFileExtension, baseCommitTime)); } /** - * Get all the log files for the passed in FileId in the partition path. + * Get all the log files for the passed in file-id in the partition path. */ public static Stream getAllLogFiles(FileSystem fs, Path partitionPath, final String fileId, final String logFileExtension, final String baseCommitTime) throws IOException { try { - return Arrays - .stream(fs.listStatus(partitionPath, - path -> path.getName().startsWith("." + fileId) && path.getName().contains(logFileExtension))) - .map(HoodieLogFile::new).filter(s -> s.getBaseCommitTime().equals(baseCommitTime)); + PathFilter pathFilter = path -> path.getName().startsWith("." 
+ fileId) && path.getName().contains(logFileExtension); + return Arrays.stream(fs.listStatus(partitionPath, pathFilter)) + .map(HoodieLogFile::new) + .filter(s -> s.getBaseCommitTime().equals(baseCommitTime)); } catch (FileNotFoundException e) { - return Stream.builder().build(); + return Stream.of(); } } @@ -776,4 +788,8 @@ public static List getFileStatusAtLevel( public interface SerializableFunction extends Function, Serializable { } + + private static Option getLatestLogFile(Stream logFiles) { + return Option.fromJavaOptional(logFiles.min(HoodieLogFile.getReverseLogFileComparator())); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/FileSystemRetryConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/FileSystemRetryConfig.java new file mode 100644 index 0000000000000..c7f99ece7e45d --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/FileSystemRetryConfig.java @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.fs; + +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Properties; + +/** + * The file system retry relevant config options. 
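+ *
+ * Editorial note (a sketch of the defaults declared below, not part of the original patch):
+ * a failing filesystem call is retried up to 4 times with exponential backoff, waiting
+ * 100 ms initially and at most 2000 ms between attempts, e.g.:
+ *   hoodie.filesystem.operation.retry.enable=true
+ *   hoodie.filesystem.operation.retry.max_numbers=4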
+ */ +@ConfigClassProperty(name = "FileSystem Guard Configurations", + groupName = ConfigGroups.Names.WRITE_CLIENT, + description = "The filesystem retry related config options, to help deal with runtime exception like list/get/put/delete performance issues.") +public class FileSystemRetryConfig extends HoodieConfig { + + public static final ConfigProperty FILESYSTEM_RETRY_ENABLE = ConfigProperty + .key("hoodie.filesystem.operation.retry.enable") + .defaultValue("false") + .sinceVersion("0.11.0") + .withDocumentation("Enabled to handle list/get/delete etc file system performance issue."); + + public static final ConfigProperty INITIAL_RETRY_INTERVAL_MS = ConfigProperty + .key("hoodie.filesystem.operation.retry.initial_interval_ms") + .defaultValue(100L) + .sinceVersion("0.11.0") + .withDocumentation("Amount of time (in ms) to wait, before retry to do operations on storage."); + + public static final ConfigProperty MAX_RETRY_INTERVAL_MS = ConfigProperty + .key("hoodie.filesystem.operation.retry.max_interval_ms") + .defaultValue(2000L) + .sinceVersion("0.11.0") + .withDocumentation("Maximum amount of time (in ms), to wait for next retry."); + + public static final ConfigProperty MAX_RETRY_NUMBERS = ConfigProperty + .key("hoodie.filesystem.operation.retry.max_numbers") + .defaultValue(4) + .sinceVersion("0.11.0") + .withDocumentation("Maximum number of retry actions to perform, with exponential backoff."); + + public static final ConfigProperty RETRY_EXCEPTIONS = ConfigProperty + .key("hoodie.filesystem.operation.retry.exceptions") + .defaultValue("") + .sinceVersion("0.11.0") + .withDocumentation("The class name of the Exception that needs to be re-tryed, separated by commas. " + + "Default is empty which means retry all the IOException and RuntimeException from FileSystem"); + + private FileSystemRetryConfig() { + super(); + } + + public long getInitialRetryIntervalMs() { + return getLong(INITIAL_RETRY_INTERVAL_MS); + } + + public long getMaxRetryIntervalMs() { + return getLong(MAX_RETRY_INTERVAL_MS); + } + + public int getMaxRetryNumbers() { + return getInt(MAX_RETRY_NUMBERS); + } + + public boolean isFileSystemActionRetryEnable() { + return Boolean.parseBoolean(getStringOrDefault(FILESYSTEM_RETRY_ENABLE)); + } + + public static FileSystemRetryConfig.Builder newBuilder() { + return new Builder(); + } + + public String getRetryExceptions() { + return getString(RETRY_EXCEPTIONS); + } + + /** + * The builder used to build filesystem configurations. 
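+ *
+ * Assumed usage (editorial sketch, not part of the original patch):
+ *   FileSystemRetryConfig retryConfig = FileSystemRetryConfig.newBuilder()
+ *       .withFileSystemActionRetryEnabled(true)
+ *       .withMaxRetryNumbers(4)
+ *       .withInitialRetryIntervalMs(100L)
+ *       .build();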
+ */ + public static class Builder { + + private final FileSystemRetryConfig fileSystemRetryConfig = new FileSystemRetryConfig(); + + public Builder fromFile(File propertiesFile) throws IOException { + try (FileReader reader = new FileReader(propertiesFile)) { + fileSystemRetryConfig.getProps().load(reader); + return this; + } + } + + public Builder fromProperties(Properties props) { + this.fileSystemRetryConfig.getProps().putAll(props); + return this; + } + + public Builder withMaxRetryNumbers(int numbers) { + fileSystemRetryConfig.setValue(MAX_RETRY_NUMBERS, String.valueOf(numbers)); + return this; + } + + public Builder withInitialRetryIntervalMs(long intervalMs) { + fileSystemRetryConfig.setValue(INITIAL_RETRY_INTERVAL_MS, String.valueOf(intervalMs)); + return this; + } + + public Builder withMaxRetryIntervalMs(long intervalMs) { + fileSystemRetryConfig.setValue(MAX_RETRY_INTERVAL_MS, String.valueOf(intervalMs)); + return this; + } + + public Builder withFileSystemActionRetryEnabled(boolean enabled) { + fileSystemRetryConfig.setValue(FILESYSTEM_RETRY_ENABLE, String.valueOf(enabled)); + return this; + } + + public FileSystemRetryConfig build() { + fileSystemRetryConfig.setDefaults(FileSystemRetryConfig.class.getName()); + return fileSystemRetryConfig; + } + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/HoodieRetryWrapperFileSystem.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/HoodieRetryWrapperFileSystem.java new file mode 100644 index 0000000000000..075f811a42ea7 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/HoodieRetryWrapperFileSystem.java @@ -0,0 +1,257 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.common.fs; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.CreateFlag; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Options; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; +import org.apache.hadoop.fs.RemoteIterator; +import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.util.Progressable; +import org.apache.hudi.common.util.RetryHelper; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.net.URI; +import java.util.EnumSet; + +public class HoodieRetryWrapperFileSystem extends FileSystem { + + private FileSystem fileSystem; + private long maxRetryIntervalMs; + private int maxRetryNumbers; + private long initialRetryIntervalMs; + private String retryExceptionsList; + + public HoodieRetryWrapperFileSystem(FileSystem fs, long maxRetryIntervalMs, int maxRetryNumbers, long initialRetryIntervalMs, String retryExceptions) { + this.fileSystem = fs; + this.maxRetryIntervalMs = maxRetryIntervalMs; + this.maxRetryNumbers = maxRetryNumbers; + this.initialRetryIntervalMs = initialRetryIntervalMs; + this.retryExceptionsList = retryExceptions; + + } + + @Override + public URI getUri() { + return fileSystem.getUri(); + } + + @Override + public FSDataInputStream open(Path f, int bufferSize) throws IOException { + return (FSDataInputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.open(f, bufferSize)).start(); + } + + @Override + public FSDataInputStream open(Path f) throws IOException { + return (FSDataInputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.open(f)).start(); + } + + @Override + public FSDataOutputStream create(Path f, + FsPermission permission, + boolean overwrite, + int bufferSize, + short replication, + long blockSize, + Progressable progress) throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList) + .tryWith(() -> fileSystem.create(f, permission, overwrite, bufferSize, replication, blockSize, progress)).start(); + } + + @Override + public FSDataOutputStream create(Path f, boolean overwrite) throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.create(f, overwrite)).start(); + } + + @Override + public FSDataOutputStream create(Path f) throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.create(f)).start(); + } + + @Override + public FSDataOutputStream create(Path f, Progressable progress) throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.create(f, progress)).start(); + } + + @Override + public FSDataOutputStream create(Path f, short replication) throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.create(f, 
replication)).start(); + } + + @Override + public FSDataOutputStream create(Path f, short replication, Progressable progress) throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.create(f, replication, progress)).start(); + } + + @Override + public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize) throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList) + .tryWith(() -> fileSystem.create(f, overwrite, bufferSize)).start(); + } + + @Override + public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, Progressable progress) + throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList) + .tryWith(() -> fileSystem.create(f, overwrite, bufferSize, progress)).start(); + } + + @Override + public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication, long blockSize, + Progressable progress) throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList) + .tryWith(() -> fileSystem.create(f, overwrite, bufferSize, replication, blockSize, progress)).start(); + } + + @Override + public FSDataOutputStream create(Path f, FsPermission permission, EnumSet flags, int bufferSize, + short replication, long blockSize, Progressable progress) throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList) + .tryWith(() -> fileSystem.create(f, permission, flags, bufferSize, replication, blockSize, progress)).start(); + } + + @Override + public FSDataOutputStream create(Path f, FsPermission permission, EnumSet flags, int bufferSize, + short replication, long blockSize, Progressable progress, Options.ChecksumOpt checksumOpt) throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList) + .tryWith(() -> fileSystem.create(f, permission, flags, bufferSize, replication, + blockSize, progress, checksumOpt)).start(); + } + + @Override + public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication, long blockSize) + throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList) + .tryWith(() -> fileSystem.create(f, overwrite, bufferSize, replication, blockSize)).start(); + } + + @Override + public boolean createNewFile(Path f) throws IOException { + return (boolean) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.createNewFile(f)).start(); + } + + @Override + public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.append(f, bufferSize, progress)).start(); + } + + @Override + public FSDataOutputStream append(Path f) throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.append(f)).start(); + } + + 
@Override + public FSDataOutputStream append(Path f, int bufferSize) throws IOException { + return (FSDataOutputStream) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.append(f, bufferSize)).start(); + } + + @Override + public boolean rename(Path src, Path dst) throws IOException { + return (boolean) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.rename(src, dst)).start(); + } + + @Override + public boolean delete(Path f, boolean recursive) throws IOException { + return (boolean) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.delete(f, recursive)).start(); + } + + @Override + public boolean delete(Path f) throws IOException { + return (boolean) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.delete(f, true)).start(); + } + + @Override + public FileStatus[] listStatus(Path f) throws FileNotFoundException, IOException { + return (FileStatus[]) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.listStatus(f)).start(); + } + + @Override + public FileStatus[] listStatus(Path f, PathFilter filter) throws IOException { + return (FileStatus[]) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.listStatus(f, filter)).start(); + } + + @Override + public FileStatus[] listStatus(Path[] files) throws IOException { + return (FileStatus[]) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.listStatus(files)).start(); + } + + @Override + public FileStatus[] listStatus(Path[] files, PathFilter filter) throws IOException { + return (FileStatus[]) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.listStatus(files, filter)).start(); + } + + @Override + public FileStatus[] globStatus(Path pathPattern) throws IOException { + return (FileStatus[]) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.globStatus(pathPattern)).start(); + } + + @Override + public FileStatus[] globStatus(Path pathPattern, PathFilter filter) throws IOException { + return (FileStatus[]) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.globStatus(pathPattern, filter)).start(); + } + + @Override + public RemoteIterator listLocatedStatus(Path f) throws IOException { + return (RemoteIterator) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.listLocatedStatus(f)).start(); + } + + @Override + public RemoteIterator listFiles(Path f, boolean recursive) throws IOException { + return (RemoteIterator) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList) + .tryWith(() -> fileSystem.listFiles(f, recursive)).start(); + } + + @Override + public void setWorkingDirectory(Path newDir) { + fileSystem.setWorkingDirectory(newDir); + } + + @Override + public Path getWorkingDirectory() { + return fileSystem.getWorkingDirectory(); + } + + @Override + public boolean mkdirs(Path f, FsPermission permission) throws 
IOException { + return (boolean) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.mkdirs(f, permission)).start(); + } + + @Override + public FileStatus getFileStatus(Path f) throws IOException { + return (FileStatus) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.getFileStatus(f)).start(); + } + + @Override + public boolean exists(Path f) throws IOException { + return (boolean) new RetryHelper(maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, retryExceptionsList).tryWith(() -> fileSystem.exists(f)).start(); + } + + @Override + public Configuration getConf() { + return fileSystem.getConf(); + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/HoodieWrapperFileSystem.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/HoodieWrapperFileSystem.java index 8521fd8205808..4bbd94384420d 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/fs/HoodieWrapperFileSystem.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/HoodieWrapperFileSystem.java @@ -136,7 +136,7 @@ public static Path convertToHoodiePath(Path file, Configuration conf) { } } - private static Path convertPathWithScheme(Path oldPath, String newScheme) { + public static Path convertPathWithScheme(Path oldPath, String newScheme) { URI oldURI = oldPath.toUri(); URI newURI; try { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/inline/InLineFSUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/inline/InLineFSUtils.java index a2c60bc318e4b..080f228f161e9 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/fs/inline/InLineFSUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/inline/InLineFSUtils.java @@ -19,10 +19,11 @@ package org.apache.hudi.common.fs.inline; import org.apache.hadoop.fs.Path; -import org.apache.hudi.common.util.ValidationUtils; import java.io.File; +import static org.apache.hudi.common.util.ValidationUtils.checkArgument; + /** * Utils to parse InLineFileSystem paths. * Inline FS format: @@ -61,10 +62,10 @@ public static Path getInlineFilePath(Path outerPath, String origScheme, long inL /** * InlineFS Path format: - * "inlinefs://path/to/outer/file/outer_file_schema/?start_offset=start_offset>&length=" + * "inlinefs://path/to/outer/file/outer_file_scheme/?start_offset=start_offset>&length=" *

* Outer File Path format: - * "outer_file_schema://path/to/outer/file" + * "outer_file_scheme://path/to/outer/file" *

* Example * Input: "inlinefs://file1/s3a/?start_offset=20&length=40". @@ -74,40 +75,48 @@ public static Path getInlineFilePath(Path outerPath, String origScheme, long inL * @return Outer file Path from the InLineFS Path */ public static Path getOuterFilePathFromInlinePath(Path inlineFSPath) { - final String scheme = inlineFSPath.getParent().getName(); + assertInlineFSPath(inlineFSPath); + + final String outerFileScheme = inlineFSPath.getParent().getName(); final Path basePath = inlineFSPath.getParent().getParent(); - ValidationUtils.checkArgument(basePath.toString().contains(SCHEME_SEPARATOR), - "Invalid InLineFSPath: " + inlineFSPath); + checkArgument(basePath.toString().contains(SCHEME_SEPARATOR), + "Invalid InLineFS path: " + inlineFSPath); final String pathExceptScheme = basePath.toString().substring(basePath.toString().indexOf(SCHEME_SEPARATOR) + 1); - final String fullPath = scheme + SCHEME_SEPARATOR - + (scheme.equals(LOCAL_FILESYSTEM_SCHEME) ? PATH_SEPARATOR : "") + final String fullPath = outerFileScheme + SCHEME_SEPARATOR + + (outerFileScheme.equals(LOCAL_FILESYSTEM_SCHEME) ? PATH_SEPARATOR : "") + pathExceptScheme; return new Path(fullPath); } /** - * Eg input : "inlinefs://file1/s3a/?start_offset=20&length=40". - * output: 20 + * Returns start offset w/in the base for the block identified by the given InlineFS path * - * @param inlinePath - * @return + * input: "inlinefs://file1/s3a/?start_offset=20&length=40". + * output: 20 */ - public static int startOffset(Path inlinePath) { - String[] slices = inlinePath.toString().split("[?&=]"); + public static int startOffset(Path inlineFSPath) { + assertInlineFSPath(inlineFSPath); + + String[] slices = inlineFSPath.toString().split("[?&=]"); return Integer.parseInt(slices[slices.length - 3]); } /** - * Eg input : "inlinefs:/file1/s3a/?start_offset=20&length=40". - * Output: 40 + * Returns length of the block (embedded w/in the base file) identified by the given InlineFS path * - * @param inlinePath - * @return + * input: "inlinefs:/file1/s3a/?start_offset=20&length=40". 
+ * output: 40 */ public static int length(Path inlinePath) { + assertInlineFSPath(inlinePath); + String[] slices = inlinePath.toString().split("[?&=]"); return Integer.parseInt(slices[slices.length - 1]); } + private static void assertInlineFSPath(Path inlinePath) { + String scheme = inlinePath.toUri().getScheme(); + checkArgument(InLineFileSystem.SCHEME.equals(scheme)); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/inline/InLineFileSystem.java b/hudi-common/src/main/java/org/apache/hudi/common/fs/inline/InLineFileSystem.java index 712b6c7ff4e32..1b2ea3cbedcf5 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/fs/inline/InLineFileSystem.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/inline/InLineFileSystem.java @@ -57,6 +57,7 @@ public URI getUri() { return URI.create(getScheme()); } + @Override public String getScheme() { return SCHEME; } @@ -129,5 +130,4 @@ public Path getWorkingDirectory() { public boolean mkdirs(Path path, FsPermission fsPermission) throws IOException { throw new UnsupportedOperationException("Can't set working directory"); } - } \ No newline at end of file diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/DefaultHoodieRecordPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/DefaultHoodieRecordPayload.java index 3e7971b1b26f1..5e4b445dfc85e 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/DefaultHoodieRecordPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/DefaultHoodieRecordPayload.java @@ -18,6 +18,7 @@ package org.apache.hudi.common.model; +import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.keygen.constant.KeyGeneratorOptions; @@ -30,9 +31,6 @@ import java.util.Map; import java.util.Properties; -import static org.apache.hudi.avro.HoodieAvroUtils.bytesToAvro; -import static org.apache.hudi.avro.HoodieAvroUtils.getNestedFieldVal; - /** * {@link HoodieRecordPayload} impl that honors ordering field in both preCombine and combineAndGetUpdateValue. *

@@ -57,7 +55,7 @@ public Option combineAndGetUpdateValue(IndexedRecord currentValue return Option.empty(); } - GenericRecord incomingRecord = bytesToAvro(recordBytes, schema); + GenericRecord incomingRecord = HoodieAvroUtils.bytesToAvro(recordBytes, schema); // Null check is needed here to support schema evolution. The record in storage may be from old schema where // the new ordering column might not be present and hence returns null. @@ -81,7 +79,7 @@ public Option getInsertValue(Schema schema, Properties properties if (recordBytes.length == 0) { return Option.empty(); } - GenericRecord incomingRecord = bytesToAvro(recordBytes, schema); + GenericRecord incomingRecord = HoodieAvroUtils.bytesToAvro(recordBytes, schema); eventTime = updateEventTime(incomingRecord, properties); return isDeleteRecord(incomingRecord) ? Option.empty() : Option.of(incomingRecord); @@ -91,7 +89,13 @@ private static Option updateEventTime(GenericRecord record, Properties p boolean consistentLogicalTimestampEnabled = Boolean.parseBoolean(properties.getProperty( KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(), KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.defaultValue())); - return Option.ofNullable(getNestedFieldVal(record, properties.getProperty(HoodiePayloadProps.PAYLOAD_EVENT_TIME_FIELD_PROP_KEY), true, consistentLogicalTimestampEnabled)); + return Option.ofNullable( + HoodieAvroUtils.getNestedFieldVal( + record, + properties.getProperty(HoodiePayloadProps.PAYLOAD_EVENT_TIME_FIELD_PROP_KEY), + true, + consistentLogicalTimestampEnabled) + ); } @Override @@ -117,10 +121,13 @@ protected boolean needUpdatingPersistedRecord(IndexedRecord currentValue, boolean consistentLogicalTimestampEnabled = Boolean.parseBoolean(properties.getProperty( KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(), KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.defaultValue())); - Object persistedOrderingVal = getNestedFieldVal((GenericRecord) currentValue, - properties.getProperty(HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP_KEY), true, consistentLogicalTimestampEnabled); - Comparable incomingOrderingVal = (Comparable) getNestedFieldVal((GenericRecord) incomingRecord, - properties.getProperty(HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP_KEY), true, consistentLogicalTimestampEnabled); + Object persistedOrderingVal = HoodieAvroUtils.getNestedFieldVal((GenericRecord) currentValue, + properties.getProperty(HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP_KEY), + true, consistentLogicalTimestampEnabled); + Comparable incomingOrderingVal = (Comparable) HoodieAvroUtils.getNestedFieldVal((GenericRecord) incomingRecord, + properties.getProperty(HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP_KEY), + true, consistentLogicalTimestampEnabled); return persistedOrderingVal == null || ((Comparable) persistedOrderingVal).compareTo(incomingOrderingVal) <= 0; } + } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroRecord.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroRecord.java new file mode 100644 index 0000000000000..9a9bbb2b7427f --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroRecord.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.model; + +public class HoodieAvroRecord extends HoodieRecord { + public HoodieAvroRecord(HoodieKey key, T data) { + super(key, data); + } + + public HoodieAvroRecord(HoodieKey key, T data, HoodieOperation operation) { + super(key, data, operation); + } + + public HoodieAvroRecord(HoodieRecord record) { + super(record); + } + + public HoodieAvroRecord() { + } + + @Override + public HoodieRecord newInstance() { + return new HoodieAvroRecord<>(this); + } + + @Override + public T getData() { + if (data == null) { + throw new IllegalStateException("Payload already deflated for record."); + } + return data; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCleaningPolicy.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCleaningPolicy.java index 647232fb7e3a9..58b9f7475a35f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCleaningPolicy.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCleaningPolicy.java @@ -22,5 +22,5 @@ * Hoodie cleaning policies. */ public enum HoodieCleaningPolicy { - KEEP_LATEST_FILE_VERSIONS, KEEP_LATEST_COMMITS; + KEEP_LATEST_FILE_VERSIONS, KEEP_LATEST_COMMITS, KEEP_LATEST_BY_HOURS; } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieColumnRangeMetadata.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieColumnRangeMetadata.java index ca977ae53b5f9..acf5b2298987a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieColumnRangeMetadata.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieColumnRangeMetadata.java @@ -18,8 +18,6 @@ package org.apache.hudi.common.model; -import org.apache.parquet.schema.PrimitiveStringifier; - import java.util.Objects; /** @@ -30,16 +28,21 @@ public class HoodieColumnRangeMetadata { private final String columnName; private final T minValue; private final T maxValue; - private final long numNulls; - private final PrimitiveStringifier stringifier; + private final long nullCount; + private final long valueCount; + private final long totalSize; + private final long totalUncompressedSize; - public HoodieColumnRangeMetadata(final String filePath, final String columnName, final T minValue, final T maxValue, final long numNulls, final PrimitiveStringifier stringifier) { + public HoodieColumnRangeMetadata(final String filePath, final String columnName, final T minValue, final T maxValue, + final long nullCount, long valueCount, long totalSize, long totalUncompressedSize) { this.filePath = filePath; this.columnName = columnName; this.minValue = minValue; this.maxValue = maxValue; - this.numNulls = numNulls; - this.stringifier = stringifier; + this.nullCount = nullCount; + this.valueCount = valueCount; + this.totalSize = totalSize; + this.totalUncompressedSize = totalUncompressedSize; } public String getFilePath() { @@ -58,12 +61,20 @@ public T getMaxValue() { 
return this.maxValue; } - public PrimitiveStringifier getStringifier() { - return stringifier; + public long getNullCount() { + return nullCount; + } + + public long getValueCount() { + return valueCount; + } + + public long getTotalSize() { + return totalSize; } - public long getNumNulls() { - return numNulls; + public long getTotalUncompressedSize() { + return totalUncompressedSize; } @Override @@ -79,21 +90,28 @@ public boolean equals(final Object o) { && Objects.equals(getColumnName(), that.getColumnName()) && Objects.equals(getMinValue(), that.getMinValue()) && Objects.equals(getMaxValue(), that.getMaxValue()) - && Objects.equals(getNumNulls(), that.getNumNulls()); + && Objects.equals(getNullCount(), that.getNullCount()) + && Objects.equals(getValueCount(), that.getValueCount()) + && Objects.equals(getTotalSize(), that.getTotalSize()) + && Objects.equals(getTotalUncompressedSize(), that.getTotalUncompressedSize()); } @Override public int hashCode() { - return Objects.hash(getColumnName(), getMinValue(), getMaxValue(), getNumNulls()); + return Objects.hash(getColumnName(), getMinValue(), getMaxValue(), getNullCount()); } @Override public String toString() { return "HoodieColumnRangeMetadata{" + "filePath ='" + filePath + '\'' - + "columnName='" + columnName + '\'' + + ", columnName='" + columnName + '\'' + ", minValue=" + minValue + ", maxValue=" + maxValue - + ", numNulls=" + numNulls + '}'; + + ", nullCount=" + nullCount + + ", valueCount=" + valueCount + + ", totalSize=" + totalSize + + ", totalUncompressedSize=" + totalUncompressedSize + + '}'; } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCommitMetadata.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCommitMetadata.java index c1e8cbf08b11c..c57965d727210 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCommitMetadata.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieCommitMetadata.java @@ -18,17 +18,17 @@ package org.apache.hudi.common.model; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.collection.Pair; - import com.fasterxml.jackson.annotation.JsonAutoDetect; import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.fasterxml.jackson.annotation.PropertyAccessor; import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -36,10 +36,12 @@ import java.io.Serializable; import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.stream.Collectors; /** * All the metadata that gets stored along with a commit. 
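To make the reworked column statistics concrete before the commit-metadata changes continue: a minimal sketch of populating the expanded HoodieColumnRangeMetadata shown above. The file path, column name, and all statistic values are made-up illustrations, not values from this patch.

import org.apache.hudi.common.model.HoodieColumnRangeMetadata;

public class ColumnRangeExample {
  public static void main(String[] args) {
    // All argument values below are hypothetical; the constructor shape matches the class above.
    HoodieColumnRangeMetadata<Integer> rideCountRange = new HoodieColumnRangeMetadata<>(
        "2022/01/26/bf1c2bb3-0-1_0-28-26_20220126152723.parquet", // file the statistics describe
        "ride_count", // column the statistics describe
        0,            // minValue seen in the file
        4200,         // maxValue seen in the file
        17L,          // nullCount: number of null values
        10000L,       // valueCount: total number of values
        8192L,        // totalSize: on-disk (compressed) size
        65536L);      // totalUncompressedSize: size once decompressed
    System.out.println(rideCountRange); // uses the corrected toString() above
  }
}

The four counters replace the removed PrimitiveStringifier and carry the per-file statistics that min/max alone could not express.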
@@ -90,6 +92,10 @@ public Map> getPartitionToWriteStats() { return partitionToWriteStats; } + public List getWriteStats() { + return partitionToWriteStats.values().stream().flatMap(Collection::stream).collect(Collectors.toList()); + } + public String getMetadata(String metaKey) { return extraMetadata.get(metaKey); } @@ -148,10 +154,12 @@ public Map getFileGroupIdAndFullPaths(String basePath * been touched multiple times in the given commits, the return value will keep the one * from the latest commit. * + * + * @param hadoopConf * @param basePath The base path * @return the file full path to file status mapping */ - public Map getFullPathToFileStatus(String basePath) { + public Map getFullPathToFileStatus(Configuration hadoopConf, String basePath) { Map fullPathToFileStatus = new HashMap<>(); for (List stats : getPartitionToWriteStats().values()) { // Iterate through all the written files. @@ -159,7 +167,8 @@ public Map getFullPathToFileStatus(String basePath) { String relativeFilePath = stat.getPath(); Path fullPath = relativeFilePath != null ? FSUtils.getPartitionPath(basePath, relativeFilePath) : null; if (fullPath != null) { - FileStatus fileStatus = new FileStatus(stat.getFileSizeInBytes(), false, 0, 0, + long blockSize = FSUtils.getFs(fullPath.toString(), hadoopConf).getDefaultBlockSize(fullPath); + FileStatus fileStatus = new FileStatus(stat.getFileSizeInBytes(), false, 0, blockSize, 0, fullPath); fullPathToFileStatus.put(fullPath.getName(), fileStatus); } @@ -173,14 +182,16 @@ public Map getFullPathToFileStatus(String basePath) { * been touched multiple times in the given commits, the return value will keep the one * from the latest commit by file group ID. * - *

<p>Note: different with {@link #getFullPathToFileStatus(String)}, + * <p>
Note: different with {@link #getFullPathToFileStatus(Configuration, String)}, * only the latest commit file for a file group is returned, * this is an optimization for COPY_ON_WRITE table to eliminate legacy files for filesystem view. * + * + * @param hadoopConf * @param basePath The base path * @return the file ID to file status mapping */ - public Map getFileIdToFileStatus(String basePath) { + public Map getFileIdToFileStatus(Configuration hadoopConf, String basePath) { Map fileIdToFileStatus = new HashMap<>(); for (List stats : getPartitionToWriteStats().values()) { // Iterate through all the written files. diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieLogFile.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieLogFile.java index 2515659c7b5fd..5b5a6432e633c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieLogFile.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieLogFile.java @@ -18,11 +18,10 @@ package org.apache.hudi.common.model; -import org.apache.hudi.common.fs.FSUtils; - import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.fs.FSUtils; import java.io.IOException; import java.io.Serializable; @@ -60,7 +59,7 @@ public HoodieLogFile(FileStatus fileStatus) { public HoodieLogFile(Path logPath) { this.fileStatus = null; this.pathStr = logPath.toString(); - this.fileLen = 0; + this.fileLen = -1; } public HoodieLogFile(Path logPath, Long fileLen) { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecord.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecord.java index 17427781eabb3..ac30766dd2f03 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecord.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecord.java @@ -18,21 +18,21 @@ package org.apache.hudi.common.model; -import java.util.Map; -import java.util.stream.Collectors; -import java.util.stream.IntStream; import org.apache.hudi.common.util.CollectionUtils; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; import java.io.Serializable; import java.util.List; +import java.util.Map; import java.util.Objects; -import org.apache.hudi.common.util.collection.Pair; +import java.util.stream.Collectors; +import java.util.stream.IntStream; /** * A Single Record managed by Hoodie. */ -public class HoodieRecord implements Serializable { +public abstract class HoodieRecord implements Serializable { public static final String COMMIT_TIME_METADATA_FIELD = "_hoodie_commit_time"; public static final String COMMIT_SEQNO_METADATA_FIELD = "_hoodie_commit_seqno"; @@ -40,6 +40,7 @@ public class HoodieRecord implements Serializable public static final String PARTITION_PATH_METADATA_FIELD = "_hoodie_partition_path"; public static final String FILENAME_METADATA_FIELD = "_hoodie_file_name"; public static final String OPERATION_METADATA_FIELD = "_hoodie_operation"; + public static final String HOODIE_IS_DELETED = "_hoodie_is_deleted"; public static final List HOODIE_META_COLUMNS = CollectionUtils.createImmutableList(COMMIT_TIME_METADATA_FIELD, COMMIT_SEQNO_METADATA_FIELD, @@ -64,7 +65,7 @@ public class HoodieRecord implements Serializable /** * Actual payload of the record. */ - private T data; + protected T data; /** * Current location of record on storage. 
Filled in by looking up index @@ -110,6 +111,8 @@ public HoodieRecord(HoodieRecord record) { public HoodieRecord() { } + public abstract HoodieRecord newInstance(); + public HoodieKey getKey() { return key; } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieTableQueryType.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieTableQueryType.java index 15449b32959e3..f1d7557ae22f8 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieTableQueryType.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieTableQueryType.java @@ -30,7 +30,7 @@ * */ public enum HoodieTableQueryType { - QUERY_TYPE_SNAPSHOT, - QUERY_TYPE_INCREMENTAL, - QUERY_TYPE_READ_OPTIMIZED + SNAPSHOT, + INCREMENTAL, + READ_OPTIMIZED } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java index 4be2e3e093e90..7b7bd6c6b2e5e 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java @@ -85,7 +85,7 @@ public Option getInsertValue(Schema schema) throws IOException { * @returns {@code true} if record represents a delete record. {@code false} otherwise. */ protected boolean isDeleteRecord(GenericRecord genericRecord) { - final String isDeleteKey = "_hoodie_is_deleted"; + final String isDeleteKey = HoodieRecord.HOODIE_IS_DELETED; // Modify to be compatible with new version Avro. // The new version Avro throws for GenericRecord.get if the field name // does not exist in the schema. diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/TableServiceType.java b/hudi-common/src/main/java/org/apache/hudi/common/model/TableServiceType.java index 90444a3d61aa2..69dd30782ff77 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/TableServiceType.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/TableServiceType.java @@ -24,10 +24,13 @@ * Supported runtime table services. */ public enum TableServiceType { - COMPACT, CLUSTER, CLEAN; + ARCHIVE, COMPACT, CLUSTER, CLEAN; public String getAction() { switch (this) { + case ARCHIVE: + // for table service type completeness; there is no timeline action associated with archive + return "NONE"; case COMPACT: return HoodieTimeline.COMPACTION_ACTION; case CLEAN: diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/PostgresDebeziumAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/PostgresDebeziumAvroPayload.java index 448627d97cbf7..d4be1899a1c96 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/PostgresDebeziumAvroPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/PostgresDebeziumAvroPayload.java @@ -31,6 +31,7 @@ import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.util.List; +import java.util.Properties; /** * Provides support for seamlessly applying changes captured via Debezium for PostgresDB. 
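Since the delete-marker field name is now exposed as HoodieRecord.HOODIE_IS_DELETED (used by OverwriteWithLatestAvroPayload#isDeleteRecord above), here is a small self-contained sketch of how a producer could flag a record for deletion. The schema and values are illustrative only.

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.model.HoodieRecord;

public class DeleteFlagExample {
  public static void main(String[] args) {
    // Toy schema carrying the shared delete-marker field (hypothetical record type).
    Schema schema = SchemaBuilder.record("trip").fields()
        .requiredString("uuid")
        .name(HoodieRecord.HOODIE_IS_DELETED).type().booleanType().booleanDefault(false)
        .endRecord();

    GenericRecord record = new GenericData.Record(schema);
    record.put("uuid", "rider-42");
    // A true value makes isDeleteRecord treat this record as a delete.
    record.put(HoodieRecord.HOODIE_IS_DELETED, true);

    Object flag = record.get(HoodieRecord.HOODIE_IS_DELETED);
    System.out.println(flag != null && (Boolean) flag); // prints: true
  }
}

Declaring the field in the schema matters: as the comment above notes, newer Avro versions throw on GenericRecord.get for field names absent from the schema.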
@@ -71,6 +72,19 @@ protected boolean shouldPickCurrentRecord(IndexedRecord currentRecord, IndexedRe return insertSourceLSN < currentSourceLSN; } + @Override + public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema, Properties properties) throws IOException { + // Specific to Postgres: If the updated record has TOASTED columns, + // we will need to keep the previous value for those columns + // see https://debezium.io/documentation/reference/connectors/postgresql.html#postgresql-toasted-values + Option insertOrDeleteRecord = super.combineAndGetUpdateValue(currentValue, schema, properties); + + if (insertOrDeleteRecord.isPresent()) { + mergeToastedValuesIfPresent(insertOrDeleteRecord.get(), currentValue); + } + return insertOrDeleteRecord; + } + @Override public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException { // Specific to Postgres: If the updated record has TOASTED columns, diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java index 624c02726c528..dc010366cd3b5 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableConfig.java @@ -18,12 +18,20 @@ package org.apache.hudi.common.table; +import org.apache.avro.Schema; + +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + import org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex; import org.apache.hudi.common.bootstrap.index.NoOpBootstrapIndex; import org.apache.hudi.common.config.ConfigClassProperty; import org.apache.hudi.common.config.ConfigGroups; import org.apache.hudi.common.config.ConfigProperty; import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; @@ -31,29 +39,29 @@ import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; import org.apache.hudi.common.table.timeline.HoodieInstantTimeGenerator; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; +import org.apache.hudi.common.util.BinaryUtil; import org.apache.hudi.common.util.FileIOUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.keygen.constant.KeyGeneratorOptions; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions.Config; -import org.apache.avro.Schema; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import java.io.IOException; import java.util.Arrays; import java.util.Date; +import java.util.List; import java.util.Map; import java.util.Properties; import java.util.Set; import java.util.function.BiConsumer; import java.util.stream.Collectors; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * Configurations on the Hoodie Table like type of ingestion, storage formats, hive table name etc Configurations are loaded from hoodie.properties, these properties are usually set during * 
initializing a path as hoodie base path and never changes during the lifetime of a hoodie table. @@ -183,8 +191,24 @@ public class HoodieTableConfig extends HoodieConfig { public static final ConfigProperty URL_ENCODE_PARTITIONING = KeyGeneratorOptions.URL_ENCODE_PARTITIONING; public static final ConfigProperty HIVE_STYLE_PARTITIONING_ENABLE = KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_ENABLE; + public static final List PERSISTED_CONFIG_LIST = Arrays.asList( + Config.DATE_TIME_PARSER_PROP, + Config.INPUT_TIME_UNIT, Config.TIMESTAMP_INPUT_DATE_FORMAT_LIST_DELIMITER_REGEX_PROP, + Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP, Config.TIMESTAMP_INPUT_TIMEZONE_FORMAT_PROP, + Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, Config.TIMESTAMP_OUTPUT_TIMEZONE_FORMAT_PROP, + Config.TIMESTAMP_TIMEZONE_FORMAT_PROP, Config.DATE_TIME_PARSER_PROP + ); + public static final String NO_OP_BOOTSTRAP_INDEX_CLASS = NoOpBootstrapIndex.class.getName(); + public static final ConfigProperty TABLE_CHECKSUM = ConfigProperty + .key("hoodie.table.checksum") + .noDefaultValue() + .sinceVersion("0.11.0") + .withDocumentation("Table checksum is used to guard against partial writes in HDFS. It is added as the last entry in hoodie.properties and then used to validate while reading table config."); + + private static final String TABLE_CHECKSUM_FORMAT = "%s.%s"; // . + public HoodieTableConfig(FileSystem fs, String metaPath, String payloadClassName) { super(); Path propertyPath = new Path(metaPath, HOODIE_PROPERTIES_FILE); @@ -196,6 +220,9 @@ public HoodieTableConfig(FileSystem fs, String metaPath, String payloadClassName setValue(PAYLOAD_CLASS_NAME, payloadClassName); // FIXME(vc): wonder if this can be removed. Need to look into history. try (FSDataOutputStream outputStream = fs.create(propertyPath)) { + if (!isValidChecksum()) { + setValue(TABLE_CHECKSUM, String.valueOf(generateChecksum(props))); + } props.store(outputStream, "Properties saved on " + new Date(System.currentTimeMillis())); } } @@ -206,6 +233,10 @@ public HoodieTableConfig(FileSystem fs, String metaPath, String payloadClassName "hoodie.properties file seems invalid. Please check for left over `.updated` files if any, manually copy it to hoodie.properties and retry"); } + private boolean isValidChecksum() { + return contains(TABLE_CHECKSUM) && validateChecksum(props); + } + /** * For serializing and de-serializing. */ @@ -215,13 +246,20 @@ public HoodieTableConfig() { private void fetchConfigs(FileSystem fs, String metaPath) throws IOException { Path cfgPath = new Path(metaPath, HOODIE_PROPERTIES_FILE); + Path backupCfgPath = new Path(metaPath, HOODIE_PROPERTIES_FILE_BACKUP); try (FSDataInputStream is = fs.open(cfgPath)) { props.load(is); + // validate checksum for latest table version + if (getTableVersion().versionCode() >= HoodieTableVersion.FOUR.versionCode() && !isValidChecksum()) { + LOG.warn("Checksum validation failed. Falling back to backed up configs."); + try (FSDataInputStream fsDataInputStream = fs.open(backupCfgPath)) { + props.load(fsDataInputStream); + } + } } catch (IOException ioe) { if (!fs.exists(cfgPath)) { LOG.warn("Run `table recover-configs` if config update/delete failed midway. Falling back to backed up configs."); // try the backup. this way no query ever fails if update fails midway. - Path backupCfgPath = new Path(metaPath, HOODIE_PROPERTIES_FILE_BACKUP); try (FSDataInputStream is = fs.open(backupCfgPath)) { props.load(is); } @@ -272,15 +310,31 @@ private static void modify(FileSystem fs, Path metadataFolder, Properties modify /// 2. 
delete the properties file, reads will go to the backup, until we are done. fs.delete(cfgPath, false); // 3. read current props, upsert and save back. + String checksum; try (FSDataInputStream in = fs.open(backupCfgPath); FSDataOutputStream out = fs.create(cfgPath, true)) { - Properties props = new Properties(); + Properties props = new TypedProperties(); props.load(in); modifyFn.accept(props, modifyProps); + if (props.containsKey(TABLE_CHECKSUM.key()) && validateChecksum(props)) { + checksum = props.getProperty(TABLE_CHECKSUM.key()); + } else { + checksum = String.valueOf(generateChecksum(props)); + props.setProperty(TABLE_CHECKSUM.key(), checksum); + } props.store(out, "Updated at " + System.currentTimeMillis()); } // 4. verify and remove backup. - // FIXME(vc): generate a hash for verification. + try (FSDataInputStream in = fs.open(cfgPath)) { + Properties props = new TypedProperties(); + props.load(in); + if (!props.containsKey(TABLE_CHECKSUM.key()) || !props.getProperty(TABLE_CHECKSUM.key()).equals(checksum)) { + // delete the properties file and throw exception indicating update failure + // subsequent writes will recover and update, reads will go to the backup until then + fs.delete(cfgPath, false); + throw new HoodieIOException("Checksum property missing or does not match."); + } + } fs.delete(backupCfgPath, false); } catch (IOException e) { throw new HoodieIOException("Error updating table configs.", e); @@ -331,10 +385,28 @@ public static void create(FileSystem fs, Path metadataFolder, Properties propert if (hoodieConfig.contains(TIMELINE_TIMEZONE)) { HoodieInstantTimeGenerator.setCommitTimeZone(HoodieTimelineTimeZone.valueOf(hoodieConfig.getString(TIMELINE_TIMEZONE))); } + if (hoodieConfig.contains(TABLE_CHECKSUM)) { + hoodieConfig.setValue(TABLE_CHECKSUM, hoodieConfig.getString(TABLE_CHECKSUM)); + } else { + hoodieConfig.setValue(TABLE_CHECKSUM, String.valueOf(generateChecksum(hoodieConfig.getProps()))); + } hoodieConfig.getProps().store(outputStream, "Properties saved on " + new Date(System.currentTimeMillis())); } } + public static long generateChecksum(Properties props) { + if (!props.containsKey(NAME.key())) { + throw new IllegalArgumentException(NAME.key() + " property needs to be specified"); + } + String table = props.getProperty(NAME.key()); + String database = props.getProperty(DATABASE_NAME.key(), ""); + return BinaryUtil.generateChecksum(String.format(TABLE_CHECKSUM_FORMAT, database, table).getBytes(UTF_8)); + } + + public static boolean validateChecksum(Properties props) { + return Long.parseLong(props.getProperty(TABLE_CHECKSUM.key())) == generateChecksum(props); + } + /** * Read the table type from the table properties and if not found, return the default. */ @@ -493,6 +565,13 @@ public String getUrlEncodePartitioning() { return getString(URL_ENCODE_PARTITIONING); } + /** + * Read the table checksum. 
+ */ + private Long getTableChecksum() { + return getLong(TABLE_CHECKSUM); + } + public Map propsMap() { return props.entrySet().stream() .collect(Collectors.toMap(e -> String.valueOf(e.getKey()), e -> String.valueOf(e.getValue()))); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java index b9a3673960fb3..4c1eac79dc413 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java @@ -23,6 +23,8 @@ import org.apache.hudi.common.fs.ConsistencyGuardConfig; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.fs.FailSafeConsistencyGuard; +import org.apache.hudi.common.fs.FileSystemRetryConfig; +import org.apache.hudi.common.fs.HoodieRetryWrapperFileSystem; import org.apache.hudi.common.fs.HoodieWrapperFileSystem; import org.apache.hudi.common.fs.NoOpConsistencyGuard; import org.apache.hudi.common.model.HoodieRecordPayload; @@ -54,6 +56,7 @@ import java.io.Serializable; import java.util.Arrays; import java.util.List; +import java.util.Map; import java.util.Objects; import java.util.Properties; import java.util.Set; @@ -99,12 +102,14 @@ public class HoodieTableMetaClient implements Serializable { private HoodieActiveTimeline activeTimeline; private HoodieArchivedTimeline archivedTimeline; private ConsistencyGuardConfig consistencyGuardConfig = ConsistencyGuardConfig.newBuilder().build(); + private FileSystemRetryConfig fileSystemRetryConfig = FileSystemRetryConfig.newBuilder().build(); private HoodieTableMetaClient(Configuration conf, String basePath, boolean loadActiveTimelineOnLoad, ConsistencyGuardConfig consistencyGuardConfig, Option layoutVersion, - String payloadClassName) { + String payloadClassName, FileSystemRetryConfig fileSystemRetryConfig) { LOG.info("Loading HoodieTableMetaClient from " + basePath); this.consistencyGuardConfig = consistencyGuardConfig; + this.fileSystemRetryConfig = fileSystemRetryConfig; this.hadoopConf = new SerializableConfiguration(conf); Path basePathDir = new Path(basePath); this.basePath = basePathDir.toString(); @@ -140,7 +145,8 @@ public HoodieTableMetaClient() {} public static HoodieTableMetaClient reload(HoodieTableMetaClient oldMetaClient) { return HoodieTableMetaClient.builder().setConf(oldMetaClient.hadoopConf.get()).setBasePath(oldMetaClient.basePath).setLoadActiveTimelineOnLoad(oldMetaClient.loadActiveTimelineOnLoad) - .setConsistencyGuardConfig(oldMetaClient.consistencyGuardConfig).setLayoutVersion(Option.of(oldMetaClient.timelineLayoutVersion)).setPayloadClassName(null).build(); + .setConsistencyGuardConfig(oldMetaClient.consistencyGuardConfig).setLayoutVersion(Option.of(oldMetaClient.timelineLayoutVersion)).setPayloadClassName(null) + .setFileSystemRetryConfig(oldMetaClient.fileSystemRetryConfig).build(); } /** @@ -150,7 +156,7 @@ public static HoodieTableMetaClient reload(HoodieTableMetaClient oldMetaClient) */ private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException { in.defaultReadObject(); - fs = null; // will be lazily inited + fs = null; // will be lazily initialized } private void writeObject(java.io.ObjectOutputStream out) throws IOException { @@ -255,6 +261,14 @@ public TimelineLayoutVersion getTimelineLayoutVersion() { public HoodieWrapperFileSystem getFs() { if (fs == null) { FileSystem fileSystem = FSUtils.getFs(metaPath, 
hadoopConf.newCopy()); + + if (fileSystemRetryConfig.isFileSystemActionRetryEnable()) { + fileSystem = new HoodieRetryWrapperFileSystem(fileSystem, + fileSystemRetryConfig.getMaxRetryIntervalMs(), + fileSystemRetryConfig.getMaxRetryNumbers(), + fileSystemRetryConfig.getInitialRetryIntervalMs(), + fileSystemRetryConfig.getRetryExceptions()); + } ValidationUtils.checkArgument(!(fileSystem instanceof HoodieWrapperFileSystem), "File System not expected to be that of HoodieWrapperFileSystem"); fs = new HoodieWrapperFileSystem(fileSystem, @@ -265,6 +279,10 @@ public HoodieWrapperFileSystem getFs() { return fs; } + public void setFs(HoodieWrapperFileSystem fs) { + this.fs = fs; + } + /** * Return raw file-system. * @@ -304,11 +322,15 @@ public ConsistencyGuardConfig getConsistencyGuardConfig() { return consistencyGuardConfig; } + public FileSystemRetryConfig getFileSystemRetryConfig() { + return fileSystemRetryConfig; + } + /** * Get the archived commits as a timeline. This is costly operation, as all data from the archived files are read. * This should not be used, unless for historical debugging purposes. * - * @return Active commit timeline + * @return Archived commit timeline */ public synchronized HoodieArchivedTimeline getArchivedTimeline() { if (archivedTimeline == null) { @@ -317,6 +339,20 @@ public synchronized HoodieArchivedTimeline getArchivedTimeline() { return archivedTimeline; } + /** + * Returns fresh new archived commits as a timeline from startTs (inclusive). + * + *

<p>This is a costly operation if a really early startTs is specified. + * Be cautious and use this only when the time range is short. + * + * <p>
This method is not thread-safe. + * + * @return Archived commit timeline + */ + public HoodieArchivedTimeline getArchivedTimeline(String startTs) { + return new HoodieArchivedTimeline(this, startTs); + } + /** * Validate table properties. * @param properties Properties from writeConfig. @@ -577,6 +613,7 @@ public static class Builder { private boolean loadActiveTimelineOnLoad = false; private String payloadClassName = null; private ConsistencyGuardConfig consistencyGuardConfig = ConsistencyGuardConfig.newBuilder().build(); + private FileSystemRetryConfig fileSystemRetryConfig = FileSystemRetryConfig.newBuilder().build(); private Option layoutVersion = Option.of(TimelineLayoutVersion.CURR_LAYOUT_VERSION); public Builder setConf(Configuration conf) { @@ -604,6 +641,11 @@ public Builder setConsistencyGuardConfig(ConsistencyGuardConfig consistencyGuard return this; } + public Builder setFileSystemRetryConfig(FileSystemRetryConfig fileSystemRetryConfig) { + this.fileSystemRetryConfig = fileSystemRetryConfig; + return this; + } + public Builder setLayoutVersion(Option layoutVersion) { this.layoutVersion = layoutVersion; return this; @@ -613,7 +655,7 @@ public HoodieTableMetaClient build() { ValidationUtils.checkArgument(conf != null, "Configuration needs to be set to init HoodieTableMetaClient"); ValidationUtils.checkArgument(basePath != null, "basePath needs to be set to init HoodieTableMetaClient"); return new HoodieTableMetaClient(conf, basePath, - loadActiveTimelineOnLoad, consistencyGuardConfig, layoutVersion, payloadClassName); + loadActiveTimelineOnLoad, consistencyGuardConfig, layoutVersion, payloadClassName, fileSystemRetryConfig); } } @@ -643,6 +685,12 @@ public static class PropertyBuilder { private Boolean urlEncodePartitioning; private HoodieTimelineTimeZone commitTimeZone; + /** + * Persist the configs that are written the first time and should not be changed afterwards, + * like KeyGenerator's configs.
+ */ + private Properties others = new Properties(); + private PropertyBuilder() { } @@ -750,6 +798,23 @@ public PropertyBuilder setCommitTimezone(HoodieTimelineTimeZone timelineTimeZone return this; } + public PropertyBuilder set(String key, Object value) { + if (HoodieTableConfig.PERSISTED_CONFIG_LIST.contains(key)) { + this.others.put(key, value); + } + return this; + } + + public PropertyBuilder set(Map props) { + for (String key: HoodieTableConfig.PERSISTED_CONFIG_LIST) { + Object value = props.get(key); + if (value != null) { + set(key, value); + } + } + return this; + } + public PropertyBuilder fromMetaClient(HoodieTableMetaClient metaClient) { return setTableType(metaClient.getTableType()) .setTableName(metaClient.getTableConfig().getTableName()) @@ -759,6 +824,14 @@ public PropertyBuilder fromMetaClient(HoodieTableMetaClient metaClient) { public PropertyBuilder fromProperties(Properties properties) { HoodieConfig hoodieConfig = new HoodieConfig(properties); + + for (String key: HoodieTableConfig.PERSISTED_CONFIG_LIST) { + Object value = hoodieConfig.getString(key); + if (value != null) { + set(key, value); + } + } + if (hoodieConfig.contains(HoodieTableConfig.DATABASE_NAME)) { setDatabaseName(hoodieConfig.getString(HoodieTableConfig.DATABASE_NAME)); } @@ -828,6 +901,9 @@ public Properties build() { ValidationUtils.checkArgument(tableName != null, "tableName is null"); HoodieTableConfig tableConfig = new HoodieTableConfig(); + + tableConfig.setAll(others); + if (databaseName != null) { tableConfig.setValue(HoodieTableConfig.DATABASE_NAME, databaseName); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableVersion.java b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableVersion.java index 122c387756e88..3a249689ad2b3 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableVersion.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableVersion.java @@ -34,7 +34,9 @@ public enum HoodieTableVersion { // 0.9.0 onwards TWO(2), // 0.10.0 onwards - THREE(3); + THREE(3), + // 0.11.0 onwards + FOUR(4); private final int versionCode; @@ -47,7 +49,7 @@ public int versionCode() { } public static HoodieTableVersion current() { - return THREE; + return FOUR; } public static HoodieTableVersion versionFromCode(int versionCode) { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java b/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java index a70774896ceb7..a84a9482a6707 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java @@ -21,14 +21,15 @@ import org.apache.avro.Schema; import org.apache.avro.Schema.Field; import org.apache.avro.SchemaCompatibility; - +import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; - +import org.apache.hadoop.hbase.io.hfile.CacheConfig; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.table.log.HoodieLogFormat; import org.apache.hudi.common.table.log.HoodieLogFormat.Reader; import org.apache.hudi.common.table.log.block.HoodieDataBlock; @@ -42,10 +43,11 @@ import 
org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.InvalidTableException; +import org.apache.hudi.io.storage.HoodieHFileReader; +import org.apache.hudi.io.storage.HoodieOrcReader; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; - import org.apache.parquet.avro.AvroSchemaConverter; import org.apache.parquet.format.converter.ParquetMetadataConverter; import org.apache.parquet.hadoop.ParquetFileReader; @@ -61,15 +63,11 @@ public class TableSchemaResolver { private static final Logger LOG = LogManager.getLogger(TableSchemaResolver.class); private final HoodieTableMetaClient metaClient; - private final boolean withOperationField; + private final boolean hasOperationField; public TableSchemaResolver(HoodieTableMetaClient metaClient) { - this(metaClient, false); - } - - public TableSchemaResolver(HoodieTableMetaClient metaClient, boolean withOperationField) { this.metaClient = metaClient; - this.withOperationField = withOperationField; + this.hasOperationField = hasOperationField(); } /** @@ -85,29 +83,26 @@ private MessageType getTableParquetSchemaFromDataFile() { try { switch (metaClient.getTableType()) { case COPY_ON_WRITE: - // For COW table, the file has data written must be in parquet format currently. + // For COW table, the file has data written must be in parquet or orc format currently. if (instantAndCommitMetadata.isPresent()) { HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight(); String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny().get(); - return readSchemaFromBaseFile(new Path(filePath)); + return readSchemaFromBaseFile(filePath); } else { throw new IllegalArgumentException("Could not find any data file written for commit, " + "so could not get schema for table " + metaClient.getBasePath()); } case MERGE_ON_READ: - // For MOR table, the file has data written may be a parquet file or .log file. + // For MOR table, the file has data written may be a parquet file, .log file, orc file or hfile. // Determine the file format based on the file name, and then extract schema from it. 
if (instantAndCommitMetadata.isPresent()) { HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight(); String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny().get(); - if (filePath.contains(HoodieLogFile.DELTA_EXTENSION)) { + if (filePath.contains(HoodieFileFormat.HOODIE_LOG.getFileExtension())) { // this is a log file return readSchemaFromLogFile(new Path(filePath)); - } else if (filePath.contains(HoodieFileFormat.PARQUET.getFileExtension())) { - // this is a parquet file - return readSchemaFromBaseFile(new Path(filePath)); } else { - throw new IllegalArgumentException("Unknown file format :" + filePath); + return readSchemaFromBaseFile(filePath); } } else { throw new IllegalArgumentException("Could not find any data file written for commit, " @@ -122,7 +117,22 @@ private MessageType getTableParquetSchemaFromDataFile() { } } - public Schema getTableAvroSchemaFromDataFile() throws Exception { + private MessageType readSchemaFromBaseFile(String filePath) throws IOException { + if (filePath.contains(HoodieFileFormat.PARQUET.getFileExtension())) { + // this is a parquet file + return readSchemaFromParquetBaseFile(new Path(filePath)); + } else if (filePath.contains(HoodieFileFormat.HFILE.getFileExtension())) { + // this is a HFile + return readSchemaFromHFileBaseFile(new Path(filePath)); + } else if (filePath.contains(HoodieFileFormat.ORC.getFileExtension())) { + // this is a ORC file + return readSchemaFromORCBaseFile(new Path(filePath)); + } else { + throw new IllegalArgumentException("Unknown base file format :" + filePath); + } + } + + public Schema getTableAvroSchemaFromDataFile() { return convertParquetSchemaToAvro(getTableParquetSchemaFromDataFile()); } @@ -133,7 +143,7 @@ public Schema getTableAvroSchemaFromDataFile() throws Exception { * @throws Exception */ public Schema getTableAvroSchema() throws Exception { - return getTableAvroSchema(true); + return getTableAvroSchema(metaClient.getTableConfig().populateMetaFields()); } /** @@ -151,7 +161,7 @@ public Schema getTableAvroSchema(boolean includeMetadataFields) throws Exception Option schemaFromTableConfig = metaClient.getTableConfig().getTableCreateSchema(); if (schemaFromTableConfig.isPresent()) { if (includeMetadataFields) { - return HoodieAvroUtils.addMetadataFields(schemaFromTableConfig.get(), withOperationField); + return HoodieAvroUtils.addMetadataFields(schemaFromTableConfig.get(), hasOperationField); } else { return schemaFromTableConfig.get(); } @@ -176,7 +186,7 @@ public MessageType getTableParquetSchema() throws Exception { } Option schemaFromTableConfig = metaClient.getTableConfig().getTableCreateSchema(); if (schemaFromTableConfig.isPresent()) { - Schema schema = HoodieAvroUtils.addMetadataFields(schemaFromTableConfig.get(), withOperationField); + Schema schema = HoodieAvroUtils.addMetadataFields(schemaFromTableConfig.get(), hasOperationField); return convertAvroSchemaToParquet(schema); } return getTableParquetSchemaFromDataFile(); @@ -212,14 +222,21 @@ public Schema getTableAvroSchemaWithoutMetadataFields(HoodieInstant instant) thr } /** - * Gets the schema for a hoodie table in Avro format from the HoodieCommitMetadata of the last commit. + * Gets the schema for a hoodie table in Avro format from the HoodieCommitMetadata of the last commit with valid schema. 
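As a usage sketch for the reworked resolver: reading a table's Avro schema now goes through the simplified constructor, with the operation field detected from the data files rather than passed in. The base path below is a placeholder and error handling is elided.

import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.TableSchemaResolver;

public class SchemaResolverExample {
  public static void main(String[] args) throws Exception {
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
        .setConf(new Configuration())
        .setBasePath("/tmp/hudi_trips_cow") // placeholder path
        .build();

    // The resolver now infers the presence of the _hoodie_operation field itself,
    // instead of taking a withOperationField constructor flag.
    TableSchemaResolver resolver = new TableSchemaResolver(metaClient);
    // Meta fields are included only when the table populates them (see getTableAvroSchema() above).
    Schema schema = resolver.getTableAvroSchema();
    System.out.println(schema.toString(true));
  }
}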
* * @return Avro schema for this table */ private Option getTableSchemaFromCommitMetadata(boolean includeMetadataFields) { - HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); - if (timeline.lastInstant().isPresent()) { - return getTableSchemaFromCommitMetadata(timeline.lastInstant().get(), includeMetadataFields); + Option> instantAndCommitMetadata = + metaClient.getActiveTimeline().getLastCommitMetadataWithValidSchema(); + if (instantAndCommitMetadata.isPresent()) { + HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight(); + String schemaStr = commitMetadata.getMetadata(HoodieCommitMetadata.SCHEMA_KEY); + Schema schema = new Schema.Parser().parse(schemaStr); + if (includeMetadataFields) { + schema = HoodieAvroUtils.addMetadataFields(schema, hasOperationField); + } + return Option.of(schema); } else { return Option.empty(); } @@ -244,7 +261,7 @@ private Option getTableSchemaFromCommitMetadata(HoodieInstant instant, b Schema schema = new Schema.Parser().parse(existingSchemaStr); if (includeMetadataFields) { - schema = HoodieAvroUtils.addMetadataFields(schema, withOperationField); + schema = HoodieAvroUtils.addMetadataFields(schema, hasOperationField); } return Option.of(schema); } catch (Exception e) { @@ -416,19 +433,41 @@ public Option getLatestCommitMetadata() { /** * Read the parquet schema from a parquet File. */ - public MessageType readSchemaFromBaseFile(Path parquetFilePath) throws IOException { + public MessageType readSchemaFromParquetBaseFile(Path parquetFilePath) throws IOException { LOG.info("Reading schema from " + parquetFilePath); FileSystem fs = metaClient.getRawFs(); - if (!fs.exists(parquetFilePath)) { - throw new IllegalArgumentException( - "Failed to read schema from data file " + parquetFilePath + ". File does not exist."); - } ParquetMetadata fileFooter = ParquetFileReader.readFooter(fs.getConf(), parquetFilePath, ParquetMetadataConverter.NO_FILTER); return fileFooter.getFileMetaData().getSchema(); } + /** + * Read the parquet schema from a HFile. + */ + public MessageType readSchemaFromHFileBaseFile(Path hFilePath) throws IOException { + LOG.info("Reading schema from " + hFilePath); + + FileSystem fs = metaClient.getRawFs(); + CacheConfig cacheConfig = new CacheConfig(fs.getConf()); + HoodieHFileReader hFileReader = new HoodieHFileReader<>(fs.getConf(), hFilePath, cacheConfig); + + return convertAvroSchemaToParquet(hFileReader.getSchema()); + } + + + /** + * Read the parquet schema from a ORC file. + */ + public MessageType readSchemaFromORCBaseFile(Path orcFilePath) throws IOException { + LOG.info("Reading schema from " + orcFilePath); + + FileSystem fs = metaClient.getRawFs(); + HoodieOrcReader orcReader = new HoodieOrcReader<>(fs.getConf(), orcFilePath); + + return convertAvroSchemaToParquet(orcReader.getSchema()); + } + /** * Read schema from a data file from the last compaction commit done. 
* @throws Exception @@ -445,7 +484,7 @@ public MessageType readSchemaFromLastCompaction(Option lastCompac String filePath = compactionMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny() .orElseThrow(() -> new IllegalArgumentException("Could not find any data file written for compaction " + lastCompactionCommit + ", could not get schema for table " + metaClient.getBasePath())); - return readSchemaFromBaseFile(new Path(filePath)); + return readSchemaFromBaseFile(filePath); } /** @@ -477,4 +516,18 @@ public static MessageType readSchemaFromLogFile(FileSystem fs, Path path) throws } return null; } + + public boolean isHasOperationField() { + return hasOperationField; + } + + private boolean hasOperationField() { + try { + Schema tableAvroSchema = getTableAvroSchemaFromDataFile(); + return tableAvroSchema.getField(HoodieRecord.OPERATION_METADATA_FIELD) != null; + } catch (Exception e) { + LOG.info(String.format("Failed to read operation field from avro schema (%s)", e.getMessage())); + return false; + } + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java index d495badeca4eb..fa5117e41fa76 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java @@ -18,6 +18,7 @@ package org.apache.hudi.common.table.log; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; @@ -30,7 +31,9 @@ import org.apache.hudi.common.table.log.block.HoodieDeleteBlock; import org.apache.hudi.common.table.log.block.HoodieHFileDataBlock; import org.apache.hudi.common.table.log.block.HoodieLogBlock; +import org.apache.hudi.common.table.log.block.HoodieParquetDataBlock; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.SpillableMapUtils; import org.apache.hudi.common.util.ValidationUtils; @@ -48,8 +51,8 @@ import java.io.IOException; import java.util.ArrayDeque; -import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.Deque; import java.util.HashSet; import java.util.List; @@ -174,11 +177,11 @@ protected String getKeyField() { return this.simpleKeyGenFields.get().getKey(); } - public void scan() { + public synchronized void scan() { scan(Option.empty()); } - public void scan(Option> keys) { + public synchronized void scan(Option> keys) { currentInstantLogBlocks = new ArrayDeque<>(); progress = 0.0f; totalLogFiles = new AtomicLong(0); @@ -230,6 +233,7 @@ public void scan(Option> keys) { switch (logBlock.getBlockType()) { case HFILE_DATA_BLOCK: case AVRO_DATA_BLOCK: + case PARQUET_DATA_BLOCK: LOG.info("Reading a data block from file " + logFile.getPath() + " at instant " + logBlock.getLogBlockHeader().get(INSTANT_TIME)); if (isNewInstantBlock(logBlock) && !readBlocksLazily) { @@ -356,17 +360,13 @@ private boolean isNewInstantBlock(HoodieLogBlock logBlock) { * handle it. 
*/ private void processDataBlock(HoodieDataBlock dataBlock, Option> keys) throws Exception { - // TODO (NA) - Implement getRecordItr() in HoodieAvroDataBlock and use that here - List recs = new ArrayList<>(); - if (!keys.isPresent()) { - recs = dataBlock.getRecords(); - } else { - recs = dataBlock.getRecords(keys.get()); - } - totalLogRecords.addAndGet(recs.size()); - for (IndexedRecord rec : recs) { - processNextRecord(createHoodieRecord(rec, this.hoodieTableMetaClient.getTableConfig(), this.payloadClassFQN, - this.preCombineField, this.withOperationField, this.simpleKeyGenFields, this.partitionName)); + try (ClosableIterator recordItr = dataBlock.getRecordItr(keys.orElse(Collections.emptyList()))) { + while (recordItr.hasNext()) { + IndexedRecord record = recordItr.next(); + processNextRecord(createHoodieRecord(record, this.hoodieTableMetaClient.getTableConfig(), this.payloadClassFQN, + this.preCombineField, this.withOperationField, this.simpleKeyGenFields, this.partitionName)); + totalLogRecords.incrementAndGet(); + } } } @@ -382,7 +382,7 @@ private void processDataBlock(HoodieDataBlock dataBlock, Option> ke * @param partitionName - Partition name * @return HoodieRecord created from the IndexedRecord */ - protected HoodieRecord createHoodieRecord(final IndexedRecord rec, final HoodieTableConfig hoodieTableConfig, + protected HoodieAvroRecord createHoodieRecord(final IndexedRecord rec, final HoodieTableConfig hoodieTableConfig, final String payloadClassFQN, final String preCombineField, final boolean withOperationField, final Option> simpleKeyGenFields, @@ -426,6 +426,9 @@ private void processQueuedBlocksForInstant(Deque logBlocks, int case HFILE_DATA_BLOCK: processDataBlock((HoodieHFileDataBlock) lastBlock, keys); break; + case PARQUET_DATA_BLOCK: + processDataBlock((HoodieParquetDataBlock) lastBlock, keys); + break; case DELETE_BLOCK: Arrays.stream(((HoodieDeleteBlock) lastBlock).getKeysToDelete()).forEach(this::processNextDeletedKey); break; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFileReader.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFileReader.java index e6ead54a48d77..07cb36bb169bb 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFileReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFileReader.java @@ -31,13 +31,14 @@ import org.apache.hudi.common.table.log.block.HoodieLogBlock; import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType; import org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType; +import org.apache.hudi.common.table.log.block.HoodieParquetDataBlock; import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.exception.CorruptedLogFileException; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieNotSupportedException; import org.apache.avro.Schema; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.BufferedFSInputStream; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSInputStream; @@ -46,6 +47,8 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import javax.annotation.Nullable; + import java.io.EOFException; import java.io.IOException; import java.util.Arrays; @@ -53,6 +56,9 @@ import java.util.Map; import java.util.Objects; +import static org.apache.hudi.common.util.ValidationUtils.checkArgument; +import static 
org.apache.hudi.common.util.ValidationUtils.checkState; + /** * Scans a log file and provides block level iterator on the log file Loads the entire block contents in memory Can emit * either a DataBlock, CommandBlock, DeleteBlock or CorruptBlock (if one is found). @@ -63,6 +69,7 @@ public class HoodieLogFileReader implements HoodieLogFormat.Reader { private static final int BLOCK_SCAN_READ_BUFFER_SIZE = 1024 * 1024; // 1 MB private static final Logger LOG = LogManager.getLogger(HoodieLogFileReader.class); + private final Configuration hadoopConf; private final FSDataInputStream inputStream; private final HoodieLogFile logFile; private final byte[] magicBuffer = new byte[6]; @@ -72,7 +79,7 @@ public class HoodieLogFileReader implements HoodieLogFormat.Reader { private long reverseLogFilePosition; private long lastReverseLogFilePosition; private boolean reverseReader; - private boolean enableInlineReading; + private boolean enableRecordLookups; private boolean closed = false; private transient Thread shutdownThread = null; @@ -88,74 +95,24 @@ public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSc } public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, int bufferSize, - boolean readBlockLazily, boolean reverseReader, boolean enableInlineReading, + boolean readBlockLazily, boolean reverseReader, boolean enableRecordLookups, String keyField) throws IOException { - FSDataInputStream fsDataInputStream = fs.open(logFile.getPath(), bufferSize); - this.logFile = logFile; - this.inputStream = getFSDataInputStream(fsDataInputStream, fs, bufferSize); + this.hadoopConf = fs.getConf(); + // NOTE: We repackage {@code HoodieLogFile} here to make sure that the provided path + // is prefixed with an appropriate scheme given that we're not propagating the FS + // further + this.logFile = new HoodieLogFile(FSUtils.makeQualified(fs, logFile.getPath()), logFile.getFileSize()); + this.inputStream = getFSDataInputStream(fs, this.logFile, bufferSize); this.readerSchema = readerSchema; this.readBlockLazily = readBlockLazily; this.reverseReader = reverseReader; - this.enableInlineReading = enableInlineReading; + this.enableRecordLookups = enableRecordLookups; this.keyField = keyField; if (this.reverseReader) { - this.reverseLogFilePosition = this.lastReverseLogFilePosition = logFile.getFileSize(); + this.reverseLogFilePosition = this.lastReverseLogFilePosition = this.logFile.getFileSize(); } - addShutDownHook(); - } - - public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema) throws IOException { - this(fs, logFile, readerSchema, DEFAULT_BUFFER_SIZE, false, false); - } - /** - * Fetch the right {@link FSDataInputStream} to be used by wrapping with required input streams. - * @param fsDataInputStream original instance of {@link FSDataInputStream}. - * @param fs instance of {@link FileSystem} in use. - * @param bufferSize buffer size to be used. - * @return the right {@link FSDataInputStream} as required. 
- */ - private FSDataInputStream getFSDataInputStream(FSDataInputStream fsDataInputStream, FileSystem fs, int bufferSize) { - if (FSUtils.isGCSFileSystem(fs)) { - // in GCS FS, we might need to interceptor seek offsets as we might get EOF exception - return new SchemeAwareFSDataInputStream(getFSDataInputStreamForGCS(fsDataInputStream, bufferSize), true); - } - - if (fsDataInputStream.getWrappedStream() instanceof FSInputStream) { - return new TimedFSDataInputStream(logFile.getPath(), new FSDataInputStream( - new BufferedFSInputStream((FSInputStream) fsDataInputStream.getWrappedStream(), bufferSize))); - } - - // fsDataInputStream.getWrappedStream() maybe a BufferedFSInputStream - // need to wrap in another BufferedFSInputStream the make bufferSize work? - return fsDataInputStream; - } - - /** - * GCS FileSystem needs some special handling for seek and hence this method assists to fetch the right {@link FSDataInputStream} to be - * used by wrapping with required input streams. - * @param fsDataInputStream original instance of {@link FSDataInputStream}. - * @param bufferSize buffer size to be used. - * @return the right {@link FSDataInputStream} as required. - */ - private FSDataInputStream getFSDataInputStreamForGCS(FSDataInputStream fsDataInputStream, int bufferSize) { - // incase of GCS FS, there are two flows. - // a. fsDataInputStream.getWrappedStream() instanceof FSInputStream - // b. fsDataInputStream.getWrappedStream() not an instanceof FSInputStream, but an instance of FSDataInputStream. - // (a) is handled in the first if block and (b) is handled in the second if block. If not, we fallback to original fsDataInputStream - if (fsDataInputStream.getWrappedStream() instanceof FSInputStream) { - return new TimedFSDataInputStream(logFile.getPath(), new FSDataInputStream( - new BufferedFSInputStream((FSInputStream) fsDataInputStream.getWrappedStream(), bufferSize))); - } - - if (fsDataInputStream.getWrappedStream() instanceof FSDataInputStream - && ((FSDataInputStream) fsDataInputStream.getWrappedStream()).getWrappedStream() instanceof FSInputStream) { - FSInputStream inputStream = (FSInputStream)((FSDataInputStream) fsDataInputStream.getWrappedStream()).getWrappedStream(); - return new TimedFSDataInputStream(logFile.getPath(), - new FSDataInputStream(new BufferedFSInputStream(inputStream, bufferSize))); - } - - return fsDataInputStream; + addShutDownHook(); } @Override @@ -181,15 +138,10 @@ private void addShutDownHook() { // TODO : convert content and block length to long by using ByteBuffer, raw byte [] allows // for max of Integer size private HoodieLogBlock readBlock() throws IOException { - - int blocksize; - int type; - HoodieLogBlockType blockType = null; - Map header = null; - + int blockSize; try { // 1 Read the total size of the block - blocksize = (int) inputStream.readLong(); + blockSize = (int) inputStream.readLong(); } catch (EOFException | CorruptedLogFileException e) { // An exception reading any of the above indicates a corrupt block // Create a corrupt block by finding the next MAGIC marker or EOF @@ -197,9 +149,9 @@ private HoodieLogBlock readBlock() throws IOException { } // We may have had a crash which could have written this block partially - // Skip blocksize in the stream and we should either find a sync marker (start of the next + // Skip blockSize in the stream and we should either find a sync marker (start of the next // block) or EOF. If we did not find either of it, then this block is a corrupted block. 
- boolean isCorrupted = isBlockCorrupt(blocksize); + boolean isCorrupted = isBlockCorrupted(blockSize); if (isCorrupted) { return createCorruptBlock(); } @@ -208,71 +160,85 @@ private HoodieLogBlock readBlock() throws IOException { HoodieLogFormat.LogFormatVersion nextBlockVersion = readVersion(); // 3. Read the block type for a log block - if (nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION) { - type = inputStream.readInt(); - - ValidationUtils.checkArgument(type < HoodieLogBlockType.values().length, "Invalid block byte type found " + type); - blockType = HoodieLogBlockType.values()[type]; - } + HoodieLogBlockType blockType = tryReadBlockType(nextBlockVersion); // 4. Read the header for a log block, if present - if (nextBlockVersion.hasHeader()) { - header = HoodieLogBlock.getLogMetadata(inputStream); - } - int contentLength = blocksize; + Map header = + nextBlockVersion.hasHeader() ? HoodieLogBlock.getLogMetadata(inputStream) : null; + // 5. Read the content length for the content - if (nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION) { - contentLength = (int) inputStream.readLong(); - } + // Fallback to full-block size if no content-length + // TODO replace w/ hasContentLength + int contentLength = + nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION ? (int) inputStream.readLong() : blockSize; // 6. Read the content or skip content based on IO vs Memory trade-off by client - // TODO - have a max block size and reuse this buffer in the ByteBuffer - // (hard to guess max block size for now) long contentPosition = inputStream.getPos(); - byte[] content = HoodieLogBlock.readOrSkipContent(inputStream, contentLength, readBlockLazily); + boolean shouldReadLazily = readBlockLazily && nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION; + Option content = HoodieLogBlock.tryReadContent(inputStream, contentLength, shouldReadLazily); // 7. Read footer if any - Map footer = null; - if (nextBlockVersion.hasFooter()) { - footer = HoodieLogBlock.getLogMetadata(inputStream); - } + Map footer = + nextBlockVersion.hasFooter() ? HoodieLogBlock.getLogMetadata(inputStream) : null; // 8. Read log block length, if present. This acts as a reverse pointer when traversing a // log file in reverse - @SuppressWarnings("unused") - long logBlockLength = 0; if (nextBlockVersion.hasLogBlockLength()) { - logBlockLength = inputStream.readLong(); + inputStream.readLong(); } // 9. 
Read the log block end position in the log file long blockEndPos = inputStream.getPos(); + HoodieLogBlock.HoodieLogBlockContentLocation logBlockContentLoc = + new HoodieLogBlock.HoodieLogBlockContentLocation(hadoopConf, logFile, contentPosition, contentLength, blockEndPos); + switch (Objects.requireNonNull(blockType)) { - // based on type read the block case AVRO_DATA_BLOCK: if (nextBlockVersion.getVersion() == HoodieLogFormatVersion.DEFAULT_VERSION) { - return HoodieAvroDataBlock.getBlock(content, readerSchema); + return HoodieAvroDataBlock.getBlock(content.get(), readerSchema); } else { - return new HoodieAvroDataBlock(logFile, inputStream, Option.ofNullable(content), readBlockLazily, - contentPosition, contentLength, blockEndPos, readerSchema, header, footer, keyField); + return new HoodieAvroDataBlock(inputStream, content, readBlockLazily, logBlockContentLoc, + Option.ofNullable(readerSchema), header, footer, keyField); } + case HFILE_DATA_BLOCK: - return new HoodieHFileDataBlock(logFile, inputStream, Option.ofNullable(content), readBlockLazily, - contentPosition, contentLength, blockEndPos, readerSchema, - header, footer, enableInlineReading, keyField); + checkState(nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION, + String.format("HFile block could not be of version (%d)", HoodieLogFormatVersion.DEFAULT_VERSION)); + + return new HoodieHFileDataBlock(inputStream, content, readBlockLazily, logBlockContentLoc, + Option.ofNullable(readerSchema), header, footer, enableRecordLookups); + + case PARQUET_DATA_BLOCK: + checkState(nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION, + String.format("Parquet block could not be of version (%d)", HoodieLogFormatVersion.DEFAULT_VERSION)); + + return new HoodieParquetDataBlock(inputStream, content, readBlockLazily, logBlockContentLoc, + Option.ofNullable(readerSchema), header, footer, keyField); + case DELETE_BLOCK: - return HoodieDeleteBlock.getBlock(logFile, inputStream, Option.ofNullable(content), readBlockLazily, - contentPosition, contentLength, blockEndPos, header, footer); + return new HoodieDeleteBlock(content, inputStream, readBlockLazily, Option.of(logBlockContentLoc), header, footer); + case COMMAND_BLOCK: - return HoodieCommandBlock.getBlock(logFile, inputStream, Option.ofNullable(content), readBlockLazily, - contentPosition, contentLength, blockEndPos, header, footer); + return new HoodieCommandBlock(content, inputStream, readBlockLazily, Option.of(logBlockContentLoc), header, footer); + default: throw new HoodieNotSupportedException("Unsupported Block " + blockType); } } + @Nullable + private HoodieLogBlockType tryReadBlockType(HoodieLogFormat.LogFormatVersion blockVersion) throws IOException { + if (blockVersion.getVersion() == HoodieLogFormatVersion.DEFAULT_VERSION) { + return null; + } + + int type = inputStream.readInt(); + checkArgument(type < HoodieLogBlockType.values().length, "Invalid block byte type found " + type); + return HoodieLogBlockType.values()[type]; + } + private HoodieLogBlock createCorruptBlock() throws IOException { LOG.info("Log " + logFile + " has a corrupted block at " + inputStream.getPos()); long currentPos = inputStream.getPos(); @@ -282,15 +248,25 @@ private HoodieLogBlock createCorruptBlock() throws IOException { LOG.info("Next available block in " + logFile + " starts at " + nextBlockOffset); int corruptedBlockSize = (int) (nextBlockOffset - currentPos); long contentPosition = inputStream.getPos(); - byte[] corruptedBytes = 
HoodieLogBlock.readOrSkipContent(inputStream, corruptedBlockSize, readBlockLazily); - return HoodieCorruptBlock.getBlock(logFile, inputStream, Option.ofNullable(corruptedBytes), readBlockLazily, - contentPosition, corruptedBlockSize, nextBlockOffset, new HashMap<>(), new HashMap<>()); + Option<byte[]> corruptedBytes = HoodieLogBlock.tryReadContent(inputStream, corruptedBlockSize, readBlockLazily); + HoodieLogBlock.HoodieLogBlockContentLocation logBlockContentLoc = + new HoodieLogBlock.HoodieLogBlockContentLocation(hadoopConf, logFile, contentPosition, corruptedBlockSize, nextBlockOffset); + return new HoodieCorruptBlock(corruptedBytes, inputStream, readBlockLazily, Option.of(logBlockContentLoc), new HashMap<>(), new HashMap<>()); } - private boolean isBlockCorrupt(int blocksize) throws IOException { + private boolean isBlockCorrupted(int blocksize) throws IOException { long currentPos = inputStream.getPos(); + long blockSizeFromFooter; + try { - inputStream.seek(currentPos + blocksize); + // Check whether the block size recorded in the footer matches the one in the header, + // by seeking to the trailing long of the block and reading it. We do not seek to `currentPos + blocksize`, + // which can be the file size for the last block in the file, causing an EOFException + // for some FSDataInputStream implementations + inputStream.seek(currentPos + blocksize - Long.BYTES); + // Block size in the footer includes the magic header, which the header does not include. + // So we have to shorten the footer block size by the size of the magic header + blockSizeFromFooter = inputStream.readLong() - magicBuffer.length; } catch (EOFException e) { LOG.info("Found corrupted block in file " + logFile + " with block size(" + blocksize + ") running past EOF"); // this is corrupt @@ -301,19 +277,13 @@ private boolean isBlockCorrupt(int blocksize) throws IOException { return true; } - // check if the blocksize mentioned in the footer is the same as the header; by seeking back the length of a long - // the backward seek does not incur additional IO as {@link org.apache.hadoop.hdfs.DFSInputStream#seek()} - // only moves the index. actual IO happens on the next read operation - inputStream.seek(inputStream.getPos() - Long.BYTES); - // Block size in the footer includes the magic header, which the header does not include. - // So we have to shorten the footer block size by the size of magic hash - long blockSizeFromFooter = inputStream.readLong() - magicBuffer.length; if (blocksize != blockSizeFromFooter) { LOG.info("Found corrupted block in file " + logFile + ". Header block size(" + blocksize - + ") did not match the footer block size(" + blockSizeFromFooter + ")"); + + ") did not match the footer block size(" + blockSizeFromFooter + ")"); inputStream.seek(currentPos); return true; } + try { readMagic(); // all good - either we found the sync marker or EOF. Reset position and continue @@ -481,4 +451,59 @@ public long moveToPrev() throws IOException { public void remove() { throw new UnsupportedOperationException("Remove not supported for HoodieLogFileReader"); }
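The rewritten isBlockCorrupted above cross-checks the block length that is written twice per block: once in the header, and once as the trailing long of the block, where it also counts the magic bytes. A minimal standalone sketch of that cross-check, assuming a seekable file and a hypothetical MAGIC constant standing in for the reader's magicBuffer:

    import java.io.EOFException;
    import java.io.IOException;
    import java.io.RandomAccessFile;

    public class BlockSizeCrossCheck {
      // Hypothetical magic marker; stands in for the reader's magicBuffer
      private static final byte[] MAGIC = {'#', 'H', 'U', 'D', 'I', '#'};

      /** Returns true if the header-recorded block size disagrees with the footer-recorded one. */
      static boolean isBlockCorrupted(RandomAccessFile file, int headerBlockSize) throws IOException {
        long currentPos = file.getFilePointer();
        try {
          // Seek straight to the trailing long rather than to currentPos + headerBlockSize,
          // which for the last block equals the file size and can trip an EOFException
          file.seek(currentPos + headerBlockSize - Long.BYTES);
          // The footer length counts the magic bytes, the header length does not
          long blockSizeFromFooter = file.readLong() - MAGIC.length;
          return headerBlockSize != blockSizeFromFooter;
        } catch (EOFException e) {
          return true; // block runs past EOF: corrupted
        } finally {
          file.seek(currentPos); // restore the read position either way
        }
      }
    }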
+ + /** + * Fetch the right {@link FSDataInputStream} to be used, wrapping it with the required input streams. + * @param fs instance of {@link FileSystem} in use. + * @param bufferSize buffer size to be used. + * @return the right {@link FSDataInputStream} as required. + */ + private static FSDataInputStream getFSDataInputStream(FileSystem fs, + HoodieLogFile logFile, + int bufferSize) throws IOException { + FSDataInputStream fsDataInputStream = fs.open(logFile.getPath(), bufferSize); + + if (FSUtils.isGCSFileSystem(fs)) { + // in GCS FS, we might need to intercept seek offsets, as we might otherwise get an EOF exception + return new SchemeAwareFSDataInputStream(getFSDataInputStreamForGCS(fsDataInputStream, logFile, bufferSize), true); + } + + if (fsDataInputStream.getWrappedStream() instanceof FSInputStream) { + return new TimedFSDataInputStream(logFile.getPath(), new FSDataInputStream( + new BufferedFSInputStream((FSInputStream) fsDataInputStream.getWrappedStream(), bufferSize))); + } + + // fsDataInputStream.getWrappedStream() may be a BufferedFSInputStream; + // do we need to wrap it in another BufferedFSInputStream to make bufferSize take effect? + return fsDataInputStream; + } + + /** + * GCS FileSystem needs special handling for seeks, hence this method assists in fetching the right {@link FSDataInputStream} to be + * used, wrapping it with the required input streams. + * @param fsDataInputStream original instance of {@link FSDataInputStream}. + * @param bufferSize buffer size to be used. + * @return the right {@link FSDataInputStream} as required. + */ + private static FSDataInputStream getFSDataInputStreamForGCS(FSDataInputStream fsDataInputStream, + HoodieLogFile logFile, + int bufferSize) { + // In case of GCS FS, there are two flows: + // a. fsDataInputStream.getWrappedStream() is an instance of FSInputStream + // b. fsDataInputStream.getWrappedStream() is not an instance of FSInputStream, but an instance of FSDataInputStream. + // (a) is handled in the first if block and (b) is handled in the second if block. Otherwise, we fall back to the original fsDataInputStream + if (fsDataInputStream.getWrappedStream() instanceof FSInputStream) { + return new TimedFSDataInputStream(logFile.getPath(), new FSDataInputStream( + new BufferedFSInputStream((FSInputStream) fsDataInputStream.getWrappedStream(), bufferSize))); + } + + if (fsDataInputStream.getWrappedStream() instanceof FSDataInputStream + && ((FSDataInputStream) fsDataInputStream.getWrappedStream()).getWrappedStream() instanceof FSInputStream) { + FSInputStream inputStream = (FSInputStream)((FSDataInputStream) fsDataInputStream.getWrappedStream()).getWrappedStream(); + return new TimedFSDataInputStream(logFile.getPath(), + new FSDataInputStream(new BufferedFSInputStream(inputStream, bufferSize))); + } + + return fsDataInputStream; + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatWriter.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatWriter.java index 1c33b81246c58..8dbe85efd1164 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatWriter.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFormatWriter.java @@ -60,13 +60,6 @@ public class HoodieLogFormatWriter implements HoodieLogFormat.Writer { private static final String APPEND_UNAVAILABLE_EXCEPTION_MESSAGE = "not sufficiently replicated yet"; - /** - * @param fs - * @param logFile - * @param bufferSize - * @param replication - * @param sizeThreshold - */ HoodieLogFormatWriter(FileSystem fs, HoodieLogFile logFile, Integer bufferSize, Short replication, Long sizeThreshold, String rolloverLogWriteToken) { this.fs = fs; this.logFile = logFile; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java
b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java index 2e47e695d3144..d0ab73ab01552 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieMergedLogRecordScanner.java @@ -19,6 +19,7 @@ package org.apache.hudi.common.table.log; import org.apache.hudi.common.config.HoodieCommonConfig; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieOperation; import org.apache.hudi.common.model.HoodieRecord; @@ -144,7 +145,7 @@ protected void processNextRecord(HoodieRecord hoo HoodieRecordPayload combinedValue = hoodieRecord.getData().preCombine(oldValue); boolean choosePrev = combinedValue.equals(oldValue); HoodieOperation operation = choosePrev ? oldRecord.getOperation() : hoodieRecord.getOperation(); - records.put(key, new HoodieRecord<>(new HoodieKey(key, hoodieRecord.getPartitionPath()), combinedValue, operation)); + records.put(key, new HoodieAvroRecord<>(new HoodieKey(key, hoodieRecord.getPartitionPath()), combinedValue, operation)); } else { // Put the record as is records.put(key, hoodieRecord); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieAvroDataBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieAvroDataBlock.java index 1d3f5f3b01c56..e7f183fafcdd4 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieAvroDataBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieAvroDataBlock.java @@ -18,13 +18,6 @@ package org.apache.hudi.common.table.log.block; -import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.common.fs.SizeAwareDataInputStream; -import org.apache.hudi.common.model.HoodieLogFile; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.exception.HoodieIOException; - import org.apache.avro.Schema; import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericDatumWriter; @@ -36,59 +29,63 @@ import org.apache.avro.io.Encoder; import org.apache.avro.io.EncoderFactory; import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hudi.common.fs.SizeAwareDataInputStream; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieIOException; +import javax.annotation.Nonnull; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.zip.DeflaterOutputStream; +import java.util.zip.InflaterInputStream; -import javax.annotation.Nonnull; +import static org.apache.hudi.common.util.ValidationUtils.checkState; /** * HoodieAvroDataBlock contains a list of records serialized using Avro. It is used with the Parquet base file format. 
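The HoodieAvroDataBlock below serializes its payload as a small length-prefixed layout: a format-version int, a record-count int, then one (size, Avro-binary bytes) pair per record. A byte-level sketch of walking that layout, assuming a block version that records the count (older DEFAULT_VERSION blocks do not, hence the hasRecordCount() guard in the code) and leaving the Avro decoding itself aside:

    import java.io.ByteArrayInputStream;
    import java.io.DataInputStream;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;

    public class AvroBlockLayout {
      /** Splits a serialized Avro data block into the raw per-record byte arrays. */
      static List<byte[]> splitRecords(byte[] content) throws IOException {
        DataInputStream dis = new DataInputStream(new ByteArrayInputStream(content));
        int version = dis.readInt();       // 1. data block format version (unused in this sketch)
        int totalRecords = dis.readInt();  // 2. number of records that follow
        List<byte[]> records = new ArrayList<>(totalRecords);
        for (int i = 0; i < totalRecords; i++) {
          int size = dis.readInt();        // 3. length prefix of the next record,
          byte[] payload = new byte[size]; //    followed by its Avro-binary bytes,
          dis.readFully(payload);          //    decodable with a BinaryDecoder
          records.add(payload);
        }
        return records;
      }
    }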
*/ public class HoodieAvroDataBlock extends HoodieDataBlock { - private ThreadLocal encoderCache = new ThreadLocal<>(); - private ThreadLocal decoderCache = new ThreadLocal<>(); - - public HoodieAvroDataBlock(@Nonnull Map logBlockHeader, - @Nonnull Map logBlockFooter, - @Nonnull Option blockContentLocation, @Nonnull Option content, - FSDataInputStream inputStream, boolean readBlockLazily) { - super(logBlockHeader, logBlockFooter, blockContentLocation, content, inputStream, readBlockLazily); + private final ThreadLocal encoderCache = new ThreadLocal<>(); + + public HoodieAvroDataBlock(FSDataInputStream inputStream, + Option content, + boolean readBlockLazily, + HoodieLogBlockContentLocation logBlockContentLocation, + Option readerSchema, + Map header, + Map footer, + String keyField) { + super(content, inputStream, readBlockLazily, Option.of(logBlockContentLocation), readerSchema, header, footer, keyField, false); } - public HoodieAvroDataBlock(HoodieLogFile logFile, FSDataInputStream inputStream, Option content, - boolean readBlockLazily, long position, long blockSize, long blockEndpos, Schema readerSchema, - Map header, Map footer, String keyField) { - super(content, inputStream, readBlockLazily, - Option.of(new HoodieLogBlockContentLocation(logFile, position, blockSize, blockEndpos)), readerSchema, header, - footer, keyField); - } - - public HoodieAvroDataBlock(@Nonnull List records, @Nonnull Map header, String keyField) { + public HoodieAvroDataBlock(@Nonnull List records, + @Nonnull Map header, + @Nonnull String keyField) { super(records, header, new HashMap<>(), keyField); } - public HoodieAvroDataBlock(@Nonnull List records, @Nonnull Map header) { - super(records, header, new HashMap<>(), HoodieRecord.RECORD_KEY_METADATA_FIELD); - } - @Override public HoodieLogBlockType getBlockType() { return HoodieLogBlockType.AVRO_DATA_BLOCK; } @Override - protected byte[] serializeRecords() throws IOException { + protected byte[] serializeRecords(List records) throws IOException { Schema schema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA)); GenericDatumWriter writer = new GenericDatumWriter<>(schema); ByteArrayOutputStream baos = new ByteArrayOutputStream(); @@ -101,9 +98,7 @@ protected byte[] serializeRecords() throws IOException { output.writeInt(records.size()); // 3. 
Write the records - Iterator itr = records.iterator(); - while (itr.hasNext()) { - IndexedRecord s = itr.next(); + for (IndexedRecord s : records) { ByteArrayOutputStream temp = new ByteArrayOutputStream(); BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(temp, encoderCache.get()); encoderCache.set(encoder); @@ -118,56 +113,84 @@ protected byte[] serializeRecords() throws IOException { output.writeInt(size); // Write the content output.write(temp.toByteArray()); - itr.remove(); } catch (IOException e) { throw new HoodieIOException("IOException converting HoodieAvroDataBlock to bytes", e); } } + encoderCache.remove(); output.close(); return baos.toByteArray(); } // TODO (na) - Break down content into smaller chunks of byte [] to be GC as they are used - // TODO (na) - Implement a recordItr instead of recordList @Override - protected void deserializeRecords() throws IOException { - SizeAwareDataInputStream dis = - new SizeAwareDataInputStream(new DataInputStream(new ByteArrayInputStream(getContent().get()))); + protected ClosableIterator deserializeRecords(byte[] content) throws IOException { + checkState(this.readerSchema != null, "Reader's schema has to be non-null"); + return RecordIterator.getInstance(this, content); + } + + private static class RecordIterator implements ClosableIterator { + private byte[] content; + private final SizeAwareDataInputStream dis; + private final GenericDatumReader reader; + private final ThreadLocal decoderCache = new ThreadLocal<>(); - // 1. Read version for this data block - int version = dis.readInt(); - HoodieAvroDataBlockVersion logBlockVersion = new HoodieAvroDataBlockVersion(version); + private int totalRecords = 0; + private int readRecords = 0; - // Get schema from the header - Schema writerSchema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA)); + private RecordIterator(Schema readerSchema, Schema writerSchema, byte[] content) throws IOException { + this.content = content; - // If readerSchema was not present, use writerSchema - if (schema == null) { - schema = writerSchema; + this.dis = new SizeAwareDataInputStream(new DataInputStream(new ByteArrayInputStream(this.content))); + + // 1. Read version for this data block + int version = this.dis.readInt(); + HoodieAvroDataBlockVersion logBlockVersion = new HoodieAvroDataBlockVersion(version); + + this.reader = new GenericDatumReader<>(writerSchema, readerSchema); + + if (logBlockVersion.hasRecordCount()) { + this.totalRecords = this.dis.readInt(); + } } - GenericDatumReader reader = new GenericDatumReader<>(writerSchema, schema); - // 2. Get the total records - int totalRecords = 0; - if (logBlockVersion.hasRecordCount()) { - totalRecords = dis.readInt(); + public static RecordIterator getInstance(HoodieAvroDataBlock dataBlock, byte[] content) throws IOException { + // Get schema from the header + Schema writerSchema = new Schema.Parser().parse(dataBlock.getLogBlockHeader().get(HeaderMetadataType.SCHEMA)); + return new RecordIterator(dataBlock.readerSchema, writerSchema, content); } - List records = new ArrayList<>(totalRecords); - // 3. 
Read the content - for (int i = 0; i < totalRecords; i++) { - int recordLength = dis.readInt(); - BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(getContent().get(), dis.getNumberOfBytesRead(), - recordLength, decoderCache.get()); - decoderCache.set(decoder); - IndexedRecord record = reader.read(null, decoder); - records.add(record); - dis.skipBytes(recordLength); + @Override + public void close() { + try { + this.dis.close(); + this.decoderCache.remove(); + this.content = null; + } catch (IOException e) { + // ignore + } + } + + @Override + public boolean hasNext() { + return readRecords < totalRecords; + } + + @Override + public IndexedRecord next() { + try { + int recordLength = this.dis.readInt(); + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(this.content, this.dis.getNumberOfBytesRead(), + recordLength, this.decoderCache.get()); + this.decoderCache.set(decoder); + IndexedRecord record = this.reader.read(null, decoder); + this.dis.skipBytes(recordLength); + this.readRecords++; + return record; + } catch (IOException e) { + throw new HoodieIOException("Unable to convert bytes to record.", e); + } } - dis.close(); - this.records = records; - // Free up content to be GC'd, deflate - deflate(); } //---------------------------------------------------------------------------------------- @@ -183,9 +206,7 @@ protected void deserializeRecords() throws IOException { */ @Deprecated public HoodieAvroDataBlock(List records, Schema schema) { - super(new HashMap<>(), new HashMap<>(), Option.empty(), Option.empty(), null, false); - this.records = records; - this.schema = schema; + super(records, Collections.singletonMap(HeaderMetadataType.SCHEMA, schema.toString()), new HashMap<>(), HoodieRecord.RECORD_KEY_METADATA_FIELD); } /** @@ -201,7 +222,7 @@ public static HoodieAvroDataBlock getBlock(byte[] content, Schema readerSchema) int schemaLength = dis.readInt(); byte[] compressedSchema = new byte[schemaLength]; dis.readFully(compressedSchema, 0, schemaLength); - Schema writerSchema = new Schema.Parser().parse(HoodieAvroUtils.decompress(compressedSchema)); + Schema writerSchema = new Schema.Parser().parse(decompress(compressedSchema)); if (readerSchema == null) { readerSchema = writerSchema; @@ -224,6 +245,33 @@ public static HoodieAvroDataBlock getBlock(byte[] content, Schema readerSchema) return new HoodieAvroDataBlock(records, readerSchema); } + private static byte[] compress(String text) { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try { + OutputStream out = new DeflaterOutputStream(baos); + out.write(text.getBytes(StandardCharsets.UTF_8)); + out.close(); + } catch (IOException e) { + throw new HoodieIOException("IOException while compressing text " + text, e); + } + return baos.toByteArray(); + } + + private static String decompress(byte[] bytes) { + InputStream in = new InflaterInputStream(new ByteArrayInputStream(bytes)); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try { + byte[] buffer = new byte[8192]; + int len; + while ((len = in.read(buffer)) > 0) { + baos.write(buffer, 0, len); + } + return new String(baos.toByteArray(), StandardCharsets.UTF_8); + } catch (IOException e) { + throw new HoodieIOException("IOException while decompressing text", e); + } + } + @Deprecated public byte[] getBytes(Schema schema) throws IOException { @@ -232,10 +280,15 @@ public byte[] getBytes(Schema schema) throws IOException { DataOutputStream output = new DataOutputStream(baos); // 2. 
Compress and Write schema out - byte[] schemaContent = HoodieAvroUtils.compress(schema.toString()); + byte[] schemaContent = compress(schema.toString()); output.writeInt(schemaContent.length); output.write(schemaContent); + List records = new ArrayList<>(); + try (ClosableIterator recordItr = getRecordItr()) { + recordItr.forEachRemaining(records::add); + } + // 3. Write total number of records output.writeInt(records.size()); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCommandBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCommandBlock.java index 08909233a576b..0ff3a77b5007b 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCommandBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCommandBlock.java @@ -18,7 +18,6 @@ package org.apache.hudi.common.table.log.block; -import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.util.Option; import org.apache.hadoop.fs.FSDataInputStream; @@ -44,9 +43,9 @@ public HoodieCommandBlock(Map header) { this(Option.empty(), null, false, Option.empty(), header, new HashMap<>()); } - private HoodieCommandBlock(Option content, FSDataInputStream inputStream, boolean readBlockLazily, - Option blockContentLocation, Map header, - Map footer) { + public HoodieCommandBlock(Option content, FSDataInputStream inputStream, boolean readBlockLazily, + Option blockContentLocation, Map header, + Map footer) { super(header, footer, blockContentLocation, content, inputStream, readBlockLazily); this.type = HoodieCommandBlockTypeEnum.values()[Integer.parseInt(header.get(HeaderMetadataType.COMMAND_BLOCK_TYPE))]; @@ -65,12 +64,4 @@ public HoodieLogBlockType getBlockType() { public byte[] getContentBytes() { return new byte[0]; } - - public static HoodieLogBlock getBlock(HoodieLogFile logFile, FSDataInputStream inputStream, Option content, - boolean readBlockLazily, long position, long blockSize, long blockEndPos, Map header, - Map footer) { - - return new HoodieCommandBlock(content, inputStream, readBlockLazily, - Option.of(new HoodieLogBlockContentLocation(logFile, position, blockSize, blockEndPos)), header, footer); - } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCorruptBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCorruptBlock.java index 873be1315e50b..3e4f571588684 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCorruptBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieCorruptBlock.java @@ -18,7 +18,6 @@ package org.apache.hudi.common.table.log.block; -import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.util.Option; import org.apache.hadoop.fs.FSDataInputStream; @@ -32,15 +31,14 @@ */ public class HoodieCorruptBlock extends HoodieLogBlock { - private HoodieCorruptBlock(Option corruptedBytes, FSDataInputStream inputStream, boolean readBlockLazily, - Option blockContentLocation, Map header, - Map footer) { + public HoodieCorruptBlock(Option corruptedBytes, FSDataInputStream inputStream, boolean readBlockLazily, + Option blockContentLocation, Map header, + Map footer) { super(header, footer, blockContentLocation, corruptedBytes, inputStream, readBlockLazily); } @Override public byte[] getContentBytes() throws IOException { - if (!getContent().isPresent() && readBlockLazily) { // read content from disk inflate(); @@ -53,11 +51,4 @@ 
public HoodieLogBlockType getBlockType() { return HoodieLogBlockType.CORRUPT_BLOCK; } - public static HoodieLogBlock getBlock(HoodieLogFile logFile, FSDataInputStream inputStream, - Option corruptedBytes, boolean readBlockLazily, long position, long blockSize, long blockEndPos, - Map header, Map footer) { - - return new HoodieCorruptBlock(corruptedBytes, inputStream, readBlockLazily, - Option.of(new HoodieLogBlockContentLocation(logFile, position, blockSize, blockEndPos)), header, footer); - } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDataBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDataBlock.java index 66c9571487dff..846b8d36a5091 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDataBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDataBlock.java @@ -18,25 +18,28 @@ package org.apache.hudi.common.table.log.block; -import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.common.util.Option; -import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; import org.apache.avro.Schema; import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.fs.FSDataInputStream; -import javax.annotation.Nonnull; - import java.io.IOException; +import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.function.Function; + +import static org.apache.hudi.common.util.ValidationUtils.checkState; /** * DataBlock contains a list of records serialized using formats compatible with the base file format. * For each base file format there is a corresponding DataBlock format. - * + *

* The Datablock contains: * 1. Data Block version * 2. Total number of records in the block @@ -44,125 +47,225 @@ */ public abstract class HoodieDataBlock extends HoodieLogBlock { - protected List records; - protected Schema schema; - protected String keyField; + // TODO rebase records/content to leverage Either to warrant + // that they are mutex (used by read/write flows respectively) + private final Option> records; - public HoodieDataBlock(@Nonnull Map logBlockHeader, - @Nonnull Map logBlockFooter, - @Nonnull Option blockContentLocation, @Nonnull Option content, - FSDataInputStream inputStream, boolean readBlockLazily) { - super(logBlockHeader, logBlockFooter, blockContentLocation, content, inputStream, readBlockLazily); - this.keyField = HoodieRecord.RECORD_KEY_METADATA_FIELD; - } + /** + * Key field's name w/in the record's schema + */ + private final String keyFieldName; - public HoodieDataBlock(@Nonnull List records, @Nonnull Map header, - @Nonnull Map footer, String keyField) { - this(header, footer, Option.empty(), Option.empty(), null, false); - this.records = records; - this.schema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA)); - this.keyField = keyField; - } + private final boolean enablePointLookups; - protected HoodieDataBlock(Option content, @Nonnull FSDataInputStream inputStream, boolean readBlockLazily, - Option blockContentLocation, Schema readerSchema, - @Nonnull Map headers, @Nonnull Map footer, String keyField) { - this(headers, footer, blockContentLocation, content, inputStream, readBlockLazily); - this.schema = readerSchema; - this.keyField = keyField; - } + protected final Schema readerSchema; /** - * Util method to get a data block for the requested type. - * - * @param logDataBlockFormat - Data block type - * @param recordList - List of records that goes in the data block - * @param header - data block header - * @return Data block of the requested type. + * NOTE: This ctor is used on the write-path (ie when records ought to be written into the log) */ - public static HoodieLogBlock getBlock(HoodieLogBlockType logDataBlockFormat, List recordList, - Map header) { - return getBlock(logDataBlockFormat, recordList, header, HoodieRecord.RECORD_KEY_METADATA_FIELD); + public HoodieDataBlock(List records, + Map header, + Map footer, + String keyFieldName) { + super(header, footer, Option.empty(), Option.empty(), null, false); + this.records = Option.of(records); + this.keyFieldName = keyFieldName; + // If no reader-schema has been provided assume writer-schema as one + this.readerSchema = getWriterSchema(super.getLogBlockHeader()); + this.enablePointLookups = false; } /** - * Util method to get a data block for the requested type. - * - * @param logDataBlockFormat - Data block type - * @param recordList - List of records that goes in the data block - * @param header - data block header - * @param keyField - FieldId to get the key from the records - * @return Data block of the requested type. 
+ * NOTE: This ctor is used on the write-path (ie when records ought to be written into the log) */ - public static HoodieLogBlock getBlock(HoodieLogBlockType logDataBlockFormat, List recordList, - Map header, String keyField) { - switch (logDataBlockFormat) { - case AVRO_DATA_BLOCK: - return new HoodieAvroDataBlock(recordList, header, keyField); - case HFILE_DATA_BLOCK: - return new HoodieHFileDataBlock(recordList, header, keyField); - default: - throw new HoodieException("Data block format " + logDataBlockFormat + " not implemented"); - } + protected HoodieDataBlock(Option content, + FSDataInputStream inputStream, + boolean readBlockLazily, + Option blockContentLocation, + Option readerSchema, + Map headers, + Map footer, + String keyFieldName, + boolean enablePointLookups) { + super(headers, footer, blockContentLocation, content, inputStream, readBlockLazily); + this.records = Option.empty(); + this.keyFieldName = keyFieldName; + // If no reader-schema has been provided assume writer-schema as one + this.readerSchema = readerSchema.orElseGet(() -> getWriterSchema(super.getLogBlockHeader())); + this.enablePointLookups = enablePointLookups; } @Override public byte[] getContentBytes() throws IOException { // In case this method is called before realizing records from content - if (getContent().isPresent()) { - return getContent().get(); - } else if (readBlockLazily && !getContent().isPresent() && records == null) { - // read block lazily - createRecordsFromContentBytes(); + Option content = getContent(); + + checkState(content.isPresent() || records.isPresent(), "Block is in invalid state"); + + if (content.isPresent()) { + return content.get(); } - return serializeRecords(); + return serializeRecords(records.get()); } - public abstract HoodieLogBlockType getBlockType(); + protected static Schema getWriterSchema(Map logBlockHeader) { + return new Schema.Parser().parse(logBlockHeader.get(HeaderMetadataType.SCHEMA)); + } - public List getRecords() { - if (records == null) { - try { - // in case records are absent, read content lazily and then convert to IndexedRecords - createRecordsFromContentBytes(); - } catch (IOException io) { - throw new HoodieIOException("Unable to convert content bytes to records", io); - } + /** + * Returns all the records iterator contained w/in this block. + */ + public final ClosableIterator getRecordItr() { + if (records.isPresent()) { + return list2Iterator(records.get()); } - return records; + try { + // in case records are absent, read content lazily and then convert to IndexedRecords + return readRecordsFromBlockPayload(); + } catch (IOException io) { + throw new HoodieIOException("Unable to convert content bytes to records", io); + } + } + + public Schema getSchema() { + return readerSchema; } /** * Batch get of keys of interest. Implementation can choose to either do full scan and return matched entries or * do a seek based parsing and return matched entries. + * * @param keys keys of interest. * @return List of IndexedRecords for the keys of interest. 
- * @throws IOException + * @throws IOException in case of failures encountered when reading/parsing records */ - public List getRecords(List keys) throws IOException { - throw new UnsupportedOperationException("On demand batch get based on interested keys not supported"); - } + public final ClosableIterator getRecordItr(List keys) throws IOException { + boolean fullScan = keys.isEmpty(); + if (enablePointLookups && !fullScan) { + return lookupRecords(keys); + } - public Schema getSchema() { - // if getSchema was invoked before converting byte [] to records - if (records == null) { - getRecords(); + // Otherwise, we fetch all the records and filter out all the records, but the + // ones requested + ClosableIterator allRecords = getRecordItr(); + if (fullScan) { + return allRecords; } - return schema; + + HashSet keySet = new HashSet<>(keys); + return FilteringIterator.getInstance(allRecords, keySet, this::getRecordKey); } - protected void createRecordsFromContentBytes() throws IOException { + protected ClosableIterator readRecordsFromBlockPayload() throws IOException { if (readBlockLazily && !getContent().isPresent()) { // read log block contents from disk inflate(); } - deserializeRecords(); + try { + return deserializeRecords(getContent().get()); + } finally { + // Free up content to be GC'd by deflating the block + deflate(); + } + } + + protected ClosableIterator lookupRecords(List keys) throws IOException { + throw new UnsupportedOperationException( + String.format("Point lookups are not supported by this Data block type (%s)", getBlockType()) + ); + } + + protected abstract byte[] serializeRecords(List records) throws IOException; + + protected abstract ClosableIterator deserializeRecords(byte[] content) throws IOException; + + public abstract HoodieLogBlockType getBlockType(); + + protected Option getKeyField(Schema schema) { + return Option.ofNullable(schema.getField(keyFieldName)); + } + + protected Option getRecordKey(IndexedRecord record) { + return getKeyField(record.getSchema()) + .map(keyField -> record.get(keyField.pos())) + .map(Object::toString); + } + + /** + * Converts the given list to closable iterator. + */ + static ClosableIterator list2Iterator(List list) { + Iterator iterator = list.iterator(); + return new ClosableIterator() { + @Override + public void close() { + // ignored + } + + @Override + public boolean hasNext() { + return iterator.hasNext(); + } + + @Override + public T next() { + return iterator.next(); + } + }; } - protected abstract byte[] serializeRecords() throws IOException; + // ------------------------------------------------------------------------- + // Inner Class + // ------------------------------------------------------------------------- + + /** + * A {@link ClosableIterator} that supports filtering strategy with given keys. + * User should supply the key extraction function for fetching string format keys. 
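The FilteringIterator below is a plain look-ahead iterator: hasNext() advances the inner iterator until an element's extracted key is in the wanted set, and next() hands back the stashed match. A generic standalone sketch of the same pattern (without the ClosableIterator plumbing, and with the same contract that hasNext() must be called before each next()):

    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.Iterator;
    import java.util.Set;
    import java.util.function.Function;

    public class KeyFilteringIterator<T> implements Iterator<T> {
      private final Iterator<T> inner;
      private final Set<String> keys;
      private final Function<T, String> keyExtract;
      private T next; // look-ahead slot filled by hasNext()

      KeyFilteringIterator(Iterator<T> inner, Set<String> keys, Function<T, String> keyExtract) {
        this.inner = inner;
        this.keys = keys;
        this.keyExtract = keyExtract;
      }

      @Override
      public boolean hasNext() {
        while (inner.hasNext()) {
          T candidate = inner.next();
          if (keys.contains(keyExtract.apply(candidate))) {
            next = candidate; // stash the match for the following next() call
            return true;
          }
        }
        return false;
      }

      @Override
      public T next() {
        return next;
      }

      public static void main(String[] args) {
        Iterator<String> it = new KeyFilteringIterator<>(
            Arrays.asList("a:1", "b:2", "c:3").iterator(),
            new HashSet<>(Arrays.asList("a", "c")),
            s -> s.split(":")[0]);
        while (it.hasNext()) {
          System.out.println(it.next()); // prints "a:1" then "c:3"
        }
      }
    }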
+ * + * @param the element type + */ + private static class FilteringIterator implements ClosableIterator { + private final ClosableIterator nested; // nested iterator + + private final Set keys; // the filtering keys + private final Function> keyExtract; // function to extract the key + + private T next; + + private FilteringIterator(ClosableIterator nested, Set keys, Function> keyExtract) { + this.nested = nested; + this.keys = keys; + this.keyExtract = keyExtract; + } + + public static FilteringIterator getInstance( + ClosableIterator nested, + Set keys, + Function> keyExtract) { + return new FilteringIterator<>(nested, keys, keyExtract); + } + + @Override + public void close() { + this.nested.close(); + } + + @Override + public boolean hasNext() { + while (this.nested.hasNext()) { + this.next = this.nested.next(); + if (keys.contains(keyExtract.apply(this.next).orElse(null))) { + return true; + } + } + return false; + } - protected abstract void deserializeRecords() throws IOException; + @Override + public T next() { + return this.next; + } + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDeleteBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDeleteBlock.java index 45534f7b51013..01159ab72dffe 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDeleteBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieDeleteBlock.java @@ -20,7 +20,6 @@ import org.apache.hudi.common.fs.SizeAwareDataInputStream; import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.SerializationUtils; import org.apache.hudi.exception.HoodieIOException; @@ -47,7 +46,7 @@ public HoodieDeleteBlock(HoodieKey[] keysToDelete, Map content, FSDataInputStream inputStream, boolean readBlockLazily, + public HoodieDeleteBlock(Option content, FSDataInputStream inputStream, boolean readBlockLazily, Option blockContentLocation, Map header, Map footer) { super(header, footer, blockContentLocation, content, inputStream, readBlockLazily); @@ -55,11 +54,12 @@ private HoodieDeleteBlock(Option content, FSDataInputStream inputStream, @Override public byte[] getContentBytes() throws IOException { + Option content = getContent(); // In case this method is called before realizing keys from content - if (getContent().isPresent()) { - return getContent().get(); - } else if (readBlockLazily && !getContent().isPresent() && keysToDelete == null) { + if (content.isPresent()) { + return content.get(); + } else if (readBlockLazily && keysToDelete == null) { // read block lazily getKeysToDelete(); } @@ -100,11 +100,4 @@ public HoodieLogBlockType getBlockType() { return HoodieLogBlockType.DELETE_BLOCK; } - public static HoodieLogBlock getBlock(HoodieLogFile logFile, FSDataInputStream inputStream, Option content, - boolean readBlockLazily, long position, long blockSize, long blockEndPos, Map header, - Map footer) throws IOException { - - return new HoodieDeleteBlock(content, inputStream, readBlockLazily, - Option.of(new HoodieLogBlockContentLocation(logFile, position, blockSize, blockEndPos)), header, footer); - } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java index 7f1fa2aa1d64a..557a0db7cbfad 100644 --- 
a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java @@ -18,19 +18,7 @@ package org.apache.hudi.common.table.log.block; -import org.apache.hudi.avro.HoodieAvroUtils; -import org.apache.hudi.common.fs.inline.InLineFSUtils; -import org.apache.hudi.common.fs.inline.InLineFileSystem; -import org.apache.hudi.common.model.HoodieLogFile; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.ValidationUtils; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.io.storage.HoodieHBaseKVComparator; -import org.apache.hudi.io.storage.HoodieHFileReader; - import org.apache.avro.Schema; -import org.apache.avro.Schema.Field; import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; @@ -42,12 +30,20 @@ import org.apache.hadoop.hbase.io.hfile.HFile; import org.apache.hadoop.hbase.io.hfile.HFileContext; import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder; -import org.apache.hadoop.hbase.util.Pair; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.fs.inline.InLineFSUtils; +import org.apache.hudi.common.fs.inline.InLineFileSystem; +import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.io.storage.HoodieHBaseKVComparator; +import org.apache.hudi.io.storage.HoodieHFileReader; + import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import javax.annotation.Nonnull; - import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.Collections; @@ -56,7 +52,8 @@ import java.util.List; import java.util.Map; import java.util.TreeMap; -import java.util.stream.Collectors; + +import static org.apache.hudi.common.util.ValidationUtils.checkState; /** * HoodieHFileDataBlock contains a list of records stored inside an HFile format. 
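One detail of the serialization below: HFile requires its entries in sorted key order, so records are staged in a TreeMap, and records without a key field fall back to a fixed-width, space-padded counter whose lexicographic order matches numeric order (space sorts before every digit in ASCII). A small demonstration of that keying scheme; note this sketch computes the width with Math.log10, while the patch itself uses Math.log, which merely over-allocates padding:

    import java.util.Map;
    import java.util.TreeMap;

    public class PaddedKeyDemo {
      public static void main(String[] args) {
        int total = 1000;
        // Width of the widest decimal key, plus one column of headroom
        int keyWidth = (int) Math.ceil(Math.log10(total)) + 1;
        Map<String, String> sorted = new TreeMap<>();
        for (int id = 0; id < total; id++) {
          // e.g. "   0", "   1", ..., " 999" for keyWidth = 4
          String key = String.format("%" + keyWidth + "s", id);
          sorted.put(key, "record-" + id);
        }
        // TreeMap iteration order now matches the numeric insertion order
        System.out.println(sorted.keySet().iterator().next().trim()); // prints "0"
      }
    }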
It is used with the HFile @@ -64,27 +61,28 @@ */ public class HoodieHFileDataBlock extends HoodieDataBlock { private static final Logger LOG = LogManager.getLogger(HoodieHFileDataBlock.class); - private static Compression.Algorithm compressionAlgorithm = Compression.Algorithm.GZ; - private static int blockSize = 1 * 1024 * 1024; - private boolean enableInlineReading = false; - - public HoodieHFileDataBlock(HoodieLogFile logFile, FSDataInputStream inputStream, Option content, - boolean readBlockLazily, long position, long blockSize, long blockEndpos, - Schema readerSchema, Map header, - Map footer, boolean enableInlineReading, String keyField) { - super(content, inputStream, readBlockLazily, - Option.of(new HoodieLogBlockContentLocation(logFile, position, blockSize, blockEndpos)), - readerSchema, header, footer, keyField); - this.enableInlineReading = enableInlineReading; - } - public HoodieHFileDataBlock(@Nonnull List records, @Nonnull Map header, - String keyField) { - super(records, header, new HashMap<>(), keyField); + private static final int DEFAULT_BLOCK_SIZE = 1024 * 1024; + + private final Option compressionAlgorithm; + + public HoodieHFileDataBlock(FSDataInputStream inputStream, + Option content, + boolean readBlockLazily, + HoodieLogBlockContentLocation logBlockContentLocation, + Option readerSchema, + Map header, + Map footer, + boolean enablePointLookups) { + super(content, inputStream, readBlockLazily, Option.of(logBlockContentLocation), readerSchema, header, footer, HoodieHFileReader.KEY_FIELD_NAME, enablePointLookups); + this.compressionAlgorithm = Option.empty(); } - public HoodieHFileDataBlock(@Nonnull List records, @Nonnull Map header) { - this(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD); + public HoodieHFileDataBlock(List records, + Map header, + Compression.Algorithm compressionAlgorithm) { + super(records, header, new HashMap<>(), HoodieHFileReader.KEY_FIELD_NAME); + this.compressionAlgorithm = Option.of(compressionAlgorithm); } @Override @@ -93,43 +91,45 @@ public HoodieLogBlockType getBlockType() { } @Override - protected byte[] serializeRecords() throws IOException { - HFileContext context = new HFileContextBuilder().withBlockSize(blockSize).withCompression(compressionAlgorithm) + protected byte[] serializeRecords(List records) throws IOException { + HFileContext context = new HFileContextBuilder() + .withBlockSize(DEFAULT_BLOCK_SIZE) + .withCompression(compressionAlgorithm.get()) .build(); + Configuration conf = new Configuration(); CacheConfig cacheConfig = new CacheConfig(conf); ByteArrayOutputStream baos = new ByteArrayOutputStream(); FSDataOutputStream ostream = new FSDataOutputStream(baos, null); - HFile.Writer writer = HFile.getWriterFactory(conf, cacheConfig) - .withOutputStream(ostream).withFileContext(context).withComparator(new HoodieHBaseKVComparator()).create(); + // Use simple incrementing counter as a key + boolean useIntegerKey = !getRecordKey(records.get(0)).isPresent(); + // This is set here to avoid re-computing this in the loop + int keyWidth = useIntegerKey ? 
(int) Math.ceil(Math.log(records.size())) + 1 : -1; // Serialize records into bytes Map sortedRecordsMap = new TreeMap<>(); Iterator itr = records.iterator(); - boolean useIntegerKey = false; - int key = 0; - int keySize = 0; - Field keyField = records.get(0).getSchema().getField(this.keyField); - if (keyField == null) { - // Missing key metadata field so we should use an integer sequence key - useIntegerKey = true; - keySize = (int) Math.ceil(Math.log(records.size())) + 1; - } + + int id = 0; while (itr.hasNext()) { IndexedRecord record = itr.next(); String recordKey; if (useIntegerKey) { - recordKey = String.format("%" + keySize + "s", key++); + recordKey = String.format("%" + keyWidth + "s", id++); } else { - recordKey = record.get(keyField.pos()).toString(); + recordKey = getRecordKey(record).get(); } - byte[] recordBytes = HoodieAvroUtils.indexedRecordToBytes(record); + + final byte[] recordBytes = serializeRecord(record); ValidationUtils.checkState(!sortedRecordsMap.containsKey(recordKey), "Writing multiple records with same key not supported for " + this.getClass().getName()); sortedRecordsMap.put(recordKey, recordBytes); } + HFile.Writer writer = HFile.getWriterFactory(conf, cacheConfig) + .withOutputStream(ostream).withFileContext(context).withComparator(new HoodieHBaseKVComparator()).create(); + // Write the records sortedRecordsMap.forEach((recordKey, recordBytes) -> { try { @@ -148,65 +148,83 @@ protected byte[] serializeRecords() throws IOException { } @Override - protected void createRecordsFromContentBytes() throws IOException { - if (enableInlineReading) { - getRecords(Collections.emptyList()); - } else { - super.createRecordsFromContentBytes(); - } - } - - @Override - public List getRecords(List keys) throws IOException { - readWithInlineFS(keys); - return records; - } + protected ClosableIterator deserializeRecords(byte[] content) throws IOException { + checkState(readerSchema != null, "Reader's schema has to be non-null"); - private void readWithInlineFS(List keys) throws IOException { - boolean enableFullScan = keys.isEmpty(); // Get schema from the header Schema writerSchema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA)); - // If readerSchema was not present, use writerSchema - if (schema == null) { - schema = writerSchema; - } - Configuration conf = new Configuration(); - CacheConfig cacheConf = new CacheConfig(conf); - Configuration inlineConf = new Configuration(); - inlineConf.set("fs." + InLineFileSystem.SCHEME + ".impl", InLineFileSystem.class.getName()); - Path inlinePath = InLineFSUtils.getInlineFilePath( - getBlockContentLocation().get().getLogFile().getPath(), - getBlockContentLocation().get().getLogFile().getPath().getFileSystem(conf).getScheme(), - getBlockContentLocation().get().getContentPositionInLogFile(), - getBlockContentLocation().get().getBlockSize()); - if (!enableFullScan) { - // HFile read will be efficient if keys are sorted, since on storage, records are sorted by key. This will avoid unnecessary seeks. - Collections.sort(keys); - } - HoodieHFileReader reader = new HoodieHFileReader(inlineConf, inlinePath, cacheConf, inlinePath.getFileSystem(inlineConf)); - List> logRecords = enableFullScan ? 
reader.readAllRecords(writerSchema, schema) : - reader.readRecords(keys, schema); - reader.close(); - this.records = logRecords.stream().map(t -> t.getSecond()).collect(Collectors.toList()); + // Read the content + HoodieHFileReader reader = new HoodieHFileReader<>(content); + // Sets up the writer schema + reader.withSchema(writerSchema); + Iterator recordIterator = reader.getRecordIterator(readerSchema); + return new ClosableIterator() { + @Override + public void close() { + reader.close(); + } + + @Override + public boolean hasNext() { + return recordIterator.hasNext(); + } + + @Override + public IndexedRecord next() { + return recordIterator.next(); + } + }; } + // TODO abstract this w/in HoodieDataBlock @Override - protected void deserializeRecords() throws IOException { - // Get schema from the header - Schema writerSchema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA)); + protected ClosableIterator lookupRecords(List keys) throws IOException { + HoodieLogBlockContentLocation blockContentLoc = getBlockContentLocation().get(); - // If readerSchema was not present, use writerSchema - if (schema == null) { - schema = writerSchema; - } + // NOTE: It's important to extend Hadoop configuration here to make sure configuration + // is appropriately carried over + Configuration inlineConf = new Configuration(blockContentLoc.getHadoopConf()); + inlineConf.set("fs." + InLineFileSystem.SCHEME + ".impl", InLineFileSystem.class.getName()); - // Read the content - HoodieHFileReader reader = new HoodieHFileReader<>(getContent().get()); - List> records = reader.readAllRecords(writerSchema, schema); - this.records = records.stream().map(t -> t.getSecond()).collect(Collectors.toList()); + Path inlinePath = InLineFSUtils.getInlineFilePath( + blockContentLoc.getLogFile().getPath(), + blockContentLoc.getLogFile().getPath().getFileSystem(inlineConf).getScheme(), + blockContentLoc.getContentPositionInLogFile(), + blockContentLoc.getBlockSize()); + + // HFile read will be efficient if keys are sorted, since on storage, records are sorted by key. This will avoid unnecessary seeks. 
+ Collections.sort(keys); + + final HoodieHFileReader reader = + new HoodieHFileReader<>(inlineConf, inlinePath, new CacheConfig(inlineConf), inlinePath.getFileSystem(inlineConf)); + // Get writer's schema from the header + final ClosableIterator recordIterator = reader.getRecordIterator(keys, readerSchema); + return new ClosableIterator() { + @Override + public boolean hasNext() { + return recordIterator.hasNext(); + } + + @Override + public IndexedRecord next() { + return recordIterator.next(); + } - // Free up content to be GC'd, deflate - deflate(); + @Override + public void close() { + recordIterator.close(); + reader.close(); + } + }; + } + + private byte[] serializeRecord(IndexedRecord record) { + Option keyField = getKeyField(record.getSchema()); + // Reset key value w/in the record to avoid duplicating the key w/in payload + if (keyField.isPresent()) { + record.put(keyField.get().pos(), StringUtils.EMPTY_STRING); + } + return HoodieAvroUtils.indexedRecordToBytes(record); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieLogBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieLogBlock.java index 2fbcd992087e2..d514f28ce1c4a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieLogBlock.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieLogBlock.java @@ -18,15 +18,18 @@ package org.apache.hudi.common.table.log.block; +import org.apache.hadoop.conf.Configuration; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.TypeUtils; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; import org.apache.hadoop.fs.FSDataInputStream; import javax.annotation.Nonnull; +import javax.annotation.Nullable; import java.io.ByteArrayOutputStream; import java.io.DataInputStream; @@ -36,6 +39,8 @@ import java.util.HashMap; import java.util.Map; +import static org.apache.hudi.common.util.ValidationUtils.checkState; + /** * Abstract class defining a block in HoodieLogFile. */ @@ -58,14 +63,17 @@ public abstract class HoodieLogBlock { // TODO : change this to just InputStream so this works for any FileSystem // create handlers to return specific type of inputstream based on FS // input stream corresponding to the log file where this logBlock belongs - protected FSDataInputStream inputStream; + private final FSDataInputStream inputStream; // Toggle flag, whether to read blocks lazily (I/O intensive) or not (Memory intensive) protected boolean readBlockLazily; - public HoodieLogBlock(@Nonnull Map logBlockHeader, + public HoodieLogBlock( + @Nonnull Map logBlockHeader, @Nonnull Map logBlockFooter, - @Nonnull Option blockContentLocation, @Nonnull Option content, - FSDataInputStream inputStream, boolean readBlockLazily) { + @Nonnull Option blockContentLocation, + @Nonnull Option content, + @Nullable FSDataInputStream inputStream, + boolean readBlockLazily) { this.logBlockHeader = logBlockHeader; this.logBlockFooter = logBlockFooter; this.blockContentLocation = blockContentLocation; @@ -109,7 +117,25 @@ public Option getContent() { * Type of the log block WARNING: This enum is serialized as the ordinal. Only add new enums at the end. 
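Since the block type is persisted as the enum ordinal (the reader resolves it via HoodieLogBlockType.values()[type]), inserting or reordering constants would silently re-map blocks in existing files; the string ids introduced below give each type a stable handle. A generic sketch of the kind of id-to-enum lookup that TypeUtils.getValueToEnumMap builds, written here with plain streams and a hypothetical enum:

    import java.util.Map;
    import java.util.function.Function;
    import java.util.stream.Collectors;
    import java.util.stream.Stream;

    public class EnumIdLookup {
      enum BlockType {
        AVRO("avro"), HFILE("hfile"), PARQUET("parquet");

        final String id;

        BlockType(String id) {
          this.id = id;
        }
      }

      // Built once: avoids scanning values() on every lookup
      private static final Map<String, BlockType> ID_TO_ENUM =
          Stream.of(BlockType.values()).collect(Collectors.toMap(e -> e.id, Function.identity()));

      static BlockType fromId(String id) {
        return ID_TO_ENUM.get(id);
      }

      public static void main(String[] args) {
        System.out.println(fromId("hfile")); // prints HFILE
      }
    }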
*/ public enum HoodieLogBlockType { - COMMAND_BLOCK, DELETE_BLOCK, CORRUPT_BLOCK, AVRO_DATA_BLOCK, HFILE_DATA_BLOCK + COMMAND_BLOCK(":command"), + DELETE_BLOCK(":delete"), + CORRUPT_BLOCK(":corrupted"), + AVRO_DATA_BLOCK("avro"), + HFILE_DATA_BLOCK("hfile"), + PARQUET_DATA_BLOCK("parquet"); + + private static final Map ID_TO_ENUM_MAP = + TypeUtils.getValueToEnumMap(HoodieLogBlockType.class, e -> e.id); + + private final String id; + + HoodieLogBlockType(String id) { + this.id = id; + } + + public static HoodieLogBlockType fromId(String id) { + return ID_TO_ENUM_MAP.get(id); + } } /** @@ -132,7 +158,8 @@ public enum FooterMetadataType { * intensive CompactedScanner, the location helps to lazily read contents from the log file */ public static final class HoodieLogBlockContentLocation { - + // Hadoop Config required to access the file + private final Configuration hadoopConf; // The logFile that contains this block private final HoodieLogFile logFile; // The filePosition in the logFile for the contents of this block @@ -142,14 +169,22 @@ public static final class HoodieLogBlockContentLocation { // The final position where the complete block ends private final long blockEndPos; - HoodieLogBlockContentLocation(HoodieLogFile logFile, long contentPositionInLogFile, long blockSize, - long blockEndPos) { + public HoodieLogBlockContentLocation(Configuration hadoopConf, + HoodieLogFile logFile, + long contentPositionInLogFile, + long blockSize, + long blockEndPos) { + this.hadoopConf = hadoopConf; this.logFile = logFile; this.contentPositionInLogFile = contentPositionInLogFile; this.blockSize = blockSize; this.blockEndPos = blockEndPos; } + public Configuration getHadoopConf() { + return hadoopConf; + } + public HoodieLogFile getLogFile() { return logFile; } @@ -210,24 +245,27 @@ public static Map getLogMetadata(DataInputStream dis * Read or Skip block content of a log block in the log file. Depends on lazy reading enabled in * {@link HoodieMergedLogRecordScanner} */ - public static byte[] readOrSkipContent(FSDataInputStream inputStream, Integer contentLength, boolean readBlockLazily) + public static Option tryReadContent(FSDataInputStream inputStream, Integer contentLength, boolean readLazily) throws IOException { - byte[] content = null; - if (!readBlockLazily) { - // Read the contents in memory - content = new byte[contentLength]; - inputStream.readFully(content, 0, contentLength); - } else { + if (readLazily) { // Seek to the end of the content block inputStream.seek(inputStream.getPos() + contentLength); + return Option.empty(); } - return content; + + // TODO re-use buffer if stream is backed by buffer + // Read the contents in memory + byte[] content = new byte[contentLength]; + inputStream.readFully(content, 0, contentLength); + return Option.of(content); } /** * When lazyReading of blocks is turned on, inflate the content of a log block from disk. 
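tryReadContent above makes the memory-vs-I/O trade-off explicit: eager mode materializes the block content immediately with readFully, while lazy mode only seeks past it and leaves the bytes to be fetched later by inflate() from the recorded block location. A stripped-down sketch of the same read-or-skip decision, with java.util.Optional standing in for Hudi's Option:

    import java.io.IOException;
    import java.io.RandomAccessFile;
    import java.util.Optional;

    public class ReadOrSkip {
      static Optional<byte[]> tryReadContent(RandomAccessFile in, int contentLength, boolean readLazily)
          throws IOException {
        if (readLazily) {
          // Lazy: keep nothing in memory, just position past the content;
          // a later inflate() re-reads it from the stored (position, length)
          in.seek(in.getFilePointer() + contentLength);
          return Optional.empty();
        }
        // Eager: pull the whole block content into memory now
        byte[] content = new byte[contentLength];
        in.readFully(content);
        return Optional.of(content);
      }
    }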
*/ protected void inflate() throws HoodieIOException { + checkState(!content.isPresent(), "Block has already been inflated"); + checkState(inputStream != null, "Block should have input-stream provided"); try { content = Option.of(new byte[(int) this.getBlockContentLocation().get().getBlockSize()]); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieParquetDataBlock.java b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieParquetDataBlock.java new file mode 100644 index 0000000000000..5e7bef90a08ba --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieParquetDataBlock.java @@ -0,0 +1,166 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.table.log.block; + +import org.apache.hudi.avro.HoodieAvroWriteSupport; +import org.apache.hudi.common.fs.inline.InLineFSUtils; +import org.apache.hudi.common.fs.inline.InLineFileSystem; +import org.apache.hudi.common.util.ClosableIterator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ParquetReaderIterator; +import org.apache.hudi.io.storage.HoodieAvroParquetConfig; +import org.apache.hudi.io.storage.HoodieParquetStreamWriter; + +import org.apache.avro.Schema; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.avro.AvroParquetReader; +import org.apache.parquet.avro.AvroReadSupport; +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.hadoop.ParquetReader; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.io.InputFile; + +import javax.annotation.Nonnull; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * HoodieParquetDataBlock contains a list of records serialized using Parquet. 
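Unlike the Avro and HFile blocks, the Parquet block below does not deserialize from an in-memory byte[]; it re-reads its own byte range out of the log file through Hudi's InLineFileSystem and a projected Avro Parquet reader. A usage-style sketch of that read path, reusing only calls that appear in this patch (the path, offset, and length arguments are placeholders supplied by the caller):

    import org.apache.avro.Schema;
    import org.apache.avro.generic.GenericRecord;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hudi.common.fs.inline.InLineFSUtils;
    import org.apache.hudi.common.fs.inline.InLineFileSystem;
    import org.apache.parquet.avro.AvroParquetReader;
    import org.apache.parquet.avro.AvroReadSupport;
    import org.apache.parquet.hadoop.ParquetReader;
    import org.apache.parquet.hadoop.util.HadoopInputFile;

    public class InlineParquetReadSketch {
      static void readBlock(Path logFilePath, long contentPos, long blockSize, Schema readerSchema)
          throws Exception {
        Configuration conf = new Configuration();
        // Register the inline filesystem so a (file, offset, length) triple is addressable as a Path
        conf.set("fs." + InLineFileSystem.SCHEME + ".impl", InLineFileSystem.class.getName());

        Path inlinePath = InLineFSUtils.getInlineFilePath(
            logFilePath,
            logFilePath.getFileSystem(conf).getScheme(),
            contentPos,
            blockSize);

        // Projection: only the columns of readerSchema are actually fetched
        AvroReadSupport.setAvroReadSchema(conf, readerSchema);
        AvroReadSupport.setRequestedProjection(conf, readerSchema);

        try (ParquetReader<GenericRecord> reader =
                 AvroParquetReader.<GenericRecord>builder(HadoopInputFile.fromPath(inlinePath, conf))
                     .withConf(conf)
                     .build()) {
          GenericRecord record;
          while ((record = reader.read()) != null) {
            System.out.println(record);
          }
        }
      }
    }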
+ */ +public class HoodieParquetDataBlock extends HoodieDataBlock { + + private final Option compressionCodecName; + + public HoodieParquetDataBlock(FSDataInputStream inputStream, + Option content, + boolean readBlockLazily, + HoodieLogBlockContentLocation logBlockContentLocation, + Option readerSchema, + Map header, + Map footer, + String keyField) { + super(content, inputStream, readBlockLazily, Option.of(logBlockContentLocation), readerSchema, header, footer, keyField, false); + + this.compressionCodecName = Option.empty(); + } + + public HoodieParquetDataBlock( + @Nonnull List records, + @Nonnull Map header, + @Nonnull String keyField, + @Nonnull CompressionCodecName compressionCodecName + ) { + super(records, header, new HashMap<>(), keyField); + + this.compressionCodecName = Option.of(compressionCodecName); + } + + @Override + public HoodieLogBlockType getBlockType() { + return HoodieLogBlockType.PARQUET_DATA_BLOCK; + } + + @Override + protected byte[] serializeRecords(List records) throws IOException { + if (records.size() == 0) { + return new byte[0]; + } + + Schema writerSchema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA)); + + HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport( + new AvroSchemaConverter().convert(writerSchema), writerSchema, Option.empty()); + + HoodieAvroParquetConfig avroParquetConfig = + new HoodieAvroParquetConfig( + writeSupport, + compressionCodecName.get(), + ParquetWriter.DEFAULT_BLOCK_SIZE, + ParquetWriter.DEFAULT_PAGE_SIZE, + 1024 * 1024 * 1024, + new Configuration(), + Double.parseDouble(String.valueOf(0.1)));//HoodieStorageConfig.PARQUET_COMPRESSION_RATIO.defaultValue())); + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + + try (FSDataOutputStream outputStream = new FSDataOutputStream(baos)) { + try (HoodieParquetStreamWriter parquetWriter = new HoodieParquetStreamWriter<>(outputStream, avroParquetConfig)) { + for (IndexedRecord record : records) { + String recordKey = getRecordKey(record).orElse(null); + parquetWriter.writeAvro(recordKey, record); + } + outputStream.flush(); + } + } + + return baos.toByteArray(); + } + + public static ClosableIterator getProjectedParquetRecordsIterator(Configuration conf, + Schema readerSchema, + InputFile inputFile) throws IOException { + AvroReadSupport.setAvroReadSchema(conf, readerSchema); + AvroReadSupport.setRequestedProjection(conf, readerSchema); + + ParquetReader reader = + AvroParquetReader.builder(inputFile).withConf(conf).build(); + return new ParquetReaderIterator<>(reader); + } + + /** + * NOTE: We're overriding the whole reading sequence to make sure we properly respect + * the requested Reader's schema and only fetch the columns that have been explicitly + * requested by the caller (providing projected Reader's schema) + */ + @Override + protected ClosableIterator readRecordsFromBlockPayload() throws IOException { + HoodieLogBlockContentLocation blockContentLoc = getBlockContentLocation().get(); + + // NOTE: It's important to extend Hadoop configuration here to make sure configuration + // is appropriately carried over + Configuration inlineConf = new Configuration(blockContentLoc.getHadoopConf()); + inlineConf.set("fs." 
+ InLineFileSystem.SCHEME + ".impl", InLineFileSystem.class.getName()); + + Path inlineLogFilePath = InLineFSUtils.getInlineFilePath( + blockContentLoc.getLogFile().getPath(), + blockContentLoc.getLogFile().getPath().getFileSystem(inlineConf).getScheme(), + blockContentLoc.getContentPositionInLogFile(), + blockContentLoc.getBlockSize()); + + return getProjectedParquetRecordsIterator( + inlineConf, + readerSchema, + HadoopInputFile.fromPath(inlineLogFilePath, inlineConf)); + } + + @Override + protected ClosableIterator deserializeRecords(byte[] content) { + throw new UnsupportedOperationException("Should not be invoked"); + } +} \ No newline at end of file diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieActiveTimeline.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieActiveTimeline.java index c7473bd7d59d5..36dd5368d4a63 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieActiveTimeline.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieActiveTimeline.java @@ -24,6 +24,7 @@ import org.apache.hudi.common.table.timeline.HoodieInstant.State; import org.apache.hudi.common.util.FileIOUtils; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieIOException; @@ -70,7 +71,7 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline { SAVEPOINT_EXTENSION, INFLIGHT_SAVEPOINT_EXTENSION, CLEAN_EXTENSION, REQUESTED_CLEAN_EXTENSION, INFLIGHT_CLEAN_EXTENSION, INFLIGHT_COMPACTION_EXTENSION, REQUESTED_COMPACTION_EXTENSION, - INFLIGHT_RESTORE_EXTENSION, RESTORE_EXTENSION, + REQUESTED_RESTORE_EXTENSION, INFLIGHT_RESTORE_EXTENSION, RESTORE_EXTENSION, ROLLBACK_EXTENSION, REQUESTED_ROLLBACK_EXTENSION, INFLIGHT_ROLLBACK_EXTENSION, REQUESTED_REPLACE_COMMIT_EXTENSION, INFLIGHT_REPLACE_COMMIT_EXTENSION, REPLACE_COMMIT_EXTENSION)); private static final Logger LOG = LogManager.getLogger(HoodieActiveTimeline.class); @@ -259,6 +260,26 @@ public Option getInstantDetails(HoodieInstant instant) { return readDataFromPath(detailPath); } + /** + * Get the last instant with valid schema, and convert this to HoodieCommitMetadata + */ + public Option> getLastCommitMetadataWithValidSchema() { + List completed = getCommitsTimeline().filterCompletedInstants().getInstants() + .sorted(Comparator.comparing(HoodieInstant::getTimestamp).reversed()).collect(Collectors.toList()); + for (HoodieInstant instant : completed) { + try { + HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( + getInstantDetails(instant).get(), HoodieCommitMetadata.class); + if (!StringUtils.isNullOrEmpty(commitMetadata.getMetadata(HoodieCommitMetadata.SCHEMA_KEY))) { + return Option.of(Pair.of(instant, commitMetadata)); + } + } catch (IOException e) { + LOG.warn("Failed to convert instant to HoodieCommitMetadata: " + instant.toString()); + } + } + return Option.empty(); + } + /** * Get the last instant with valid data, and convert this to HoodieCommitMetadata */ @@ -289,6 +310,11 @@ public Option readRollbackInfoAsBytes(HoodieInstant instant) { return readDataFromPath(new Path(metaClient.getMetaPath(), instant.getFileName())); } + public Option readRestoreInfoAsBytes(HoodieInstant instant) { + // Restore metadata is always stored only in timeline .hoodie + return readDataFromPath(new Path(metaClient.getMetaPath(),
instant.getFileName())); + } + //----------------------------------------------------------------- // BEGIN - COMPACTION RELATED META-DATA MANAGEMENT. //----------------------------------------------------------------- @@ -429,6 +455,21 @@ public HoodieInstant transitionRollbackRequestedToInflight(HoodieInstant request return inflight; } + /** + * Transition Restore State from requested to inflight. + * + * @param requestedInstant requested instant + * @return commit instant + */ + public HoodieInstant transitionRestoreRequestedToInflight(HoodieInstant requestedInstant) { + ValidationUtils.checkArgument(requestedInstant.getAction().equals(HoodieTimeline.RESTORE_ACTION), "Transition to inflight requested for a restore instant with diff action " + + requestedInstant.toString()); + ValidationUtils.checkArgument(requestedInstant.isRequested(), "Transition to inflight requested for an instant not in requested state " + requestedInstant.toString()); + HoodieInstant inflight = new HoodieInstant(State.INFLIGHT, RESTORE_ACTION, requestedInstant.getTimestamp()); + transitionState(requestedInstant, inflight, Option.empty()); + return inflight; + } + /** * Transition replace requested file to replace inflight. * @@ -599,6 +640,13 @@ public void saveToRollbackRequested(HoodieInstant instant, Option conten createFileInMetaPath(instant.getFileName(), content, false); } + public void saveToRestoreRequested(HoodieInstant instant, Option content) { + ValidationUtils.checkArgument(instant.getAction().equals(HoodieTimeline.RESTORE_ACTION)); + ValidationUtils.checkArgument(instant.getState().equals(State.REQUESTED)); + // Plan is stored in meta path + createFileInMetaPath(instant.getFileName(), content, false); + } + private void createFileInMetaPath(String filename, Option content, boolean allowOverwrite) { Path fullPath = new Path(metaClient.getMetaPath(), filename); if (allowOverwrite || metaClient.getTimelineLayoutVersion().isNullVersion()) { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieArchivedTimeline.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieArchivedTimeline.java index 5ad3fa7a9f215..ddfe22ac9e02e 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieArchivedTimeline.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieArchivedTimeline.java @@ -27,6 +27,7 @@ import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.log.HoodieLogFormat; import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock; +import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.common.util.CollectionUtils; import org.apache.hudi.common.util.FileIOUtils; import org.apache.hudi.common.util.Option; @@ -54,10 +55,12 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.Spliterator; +import java.util.Spliterators; import java.util.function.Function; import java.util.regex.Matcher; import java.util.regex.Pattern; -import java.util.stream.Collectors; +import java.util.stream.StreamSupport; /** * Represents the Archived Timeline for the Hoodie table. 
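Taken together, REQUESTED_RESTORE_EXTENSION, saveToRestoreRequested, and transitionRestoreRequestedToInflight give restore the same requested -> inflight -> completed lifecycle the other timeline actions already have. A sketch of the meta-file names an instant passes through, assuming RESTORE_ACTION resolves to the literal "restore":

// Illustrative only; the real names come from makeRequestedRestoreFileName and friends.
final class RestoreFileNames {
  static String requested(String ts) { return ts + ".restore.requested"; }
  static String inflight(String ts)  { return ts + ".restore.inflight"; }
  static String completed(String ts) { return ts + ".restore"; }

  public static void main(String[] args) {
    String ts = "20220126120000";
    System.out.println(requested(ts)); // written by saveToRestoreRequested (carries the plan)
    System.out.println(inflight(ts));  // written by transitionRestoreRequestedToInflight
    System.out.println(completed(ts)); // written when the restore finishes
  }
}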
Instants for the last 12 hours (configurable) is in the @@ -79,13 +82,13 @@ public class HoodieArchivedTimeline extends HoodieDefaultTimeline { private static final String ACTION_TYPE_KEY = "actionType"; private static final String ACTION_STATE = "actionState"; private HoodieTableMetaClient metaClient; - private Map readCommits = new HashMap<>(); + private final Map readCommits = new HashMap<>(); private static final Logger LOG = LogManager.getLogger(HoodieArchivedTimeline.class); /** - * Loads instants between (startTs, endTs]. - * Note that there is no lazy loading, so this may not work if really long time range (endTs-startTs) is specified. + * Loads all the archived instants. + * Note that there is no lazy loading, so this may not work if the archived timeline range is really long. * TBD: Should we enforce maximum time range? */ public HoodieArchivedTimeline(HoodieTableMetaClient metaClient) { @@ -96,6 +99,19 @@ public HoodieArchivedTimeline(HoodieTableMetaClient metaClient) { this.details = (Function> & Serializable) this::getInstantDetails; } + /** + * Loads completed instants from startTs(inclusive). + * Note that there is no lazy loading, so this may not work if really early startTs is specified. + */ + public HoodieArchivedTimeline(HoodieTableMetaClient metaClient, String startTs) { + this.metaClient = metaClient; + setInstants(loadInstants(new StartTsFilter(startTs), true, + record -> HoodieInstant.State.COMPLETED.toString().equals(record.get(ACTION_STATE).toString()))); + // multiple casts will make this lambda serializable - + // http://docs.oracle.com/javase/specs/jls/se8/html/jls-15.html#jls-15.16 + this.details = (Function> & Serializable) this::getInstantDetails; + } + /** * For serialization and de-serialization only. * @@ -235,15 +251,14 @@ private List loadInstants(TimeRangeFilter filter, boolean loadIns HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next(); // TODO If we can store additional metadata in datablock, we can skip parsing records // (such as startTime, endTime of records in the block) - List records = blk.getRecords(); - // Filter blocks in desired time window - instantsInRange.addAll( - records.stream() - .filter(r -> commitsFilter.apply((GenericRecord) r)) - .map(r -> readCommit((GenericRecord) r, loadInstantDetails)) - .filter(c -> filter == null || filter.isInRange(c)) - .collect(Collectors.toList()) - ); + try (ClosableIterator itr = blk.getRecordItr()) { + StreamSupport.stream(Spliterators.spliteratorUnknownSize(itr, Spliterator.IMMUTABLE), true) + // Filter blocks in desired time window + .filter(r -> commitsFilter.apply((GenericRecord) r)) + .map(r -> readCommit((GenericRecord) r, loadInstantDetails)) + .filter(c -> filter == null || filter.isInRange(c)) + .forEach(instantsInRange::add); + } } if (filter != null) { @@ -300,6 +315,19 @@ public boolean isInRange(HoodieInstant instant) { } } + private static class StartTsFilter extends TimeRangeFilter { + private final String startTs; + + public StartTsFilter(String startTs) { + super(startTs, null); // endTs is never used + this.startTs = startTs; + } + + public boolean isInRange(HoodieInstant instant) { + return HoodieTimeline.compareTimestamps(instant.getTimestamp(), GREATER_THAN_OR_EQUALS, startTs); + } + } + /** * Sort files by reverse order of version suffix in file name. 
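The new constructor and StartTsFilter load only completed archived instants at or after a start timestamp. Because Hudi instant timestamps are fixed-width numeric strings, plain lexicographic comparison is enough, as this small demo of the filter's predicate shows:

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

final class StartTsFilterDemo {
  public static void main(String[] args) {
    String startTs = "20220115000000";
    List<String> instants = Arrays.asList("20220101000000", "20220115000000", "20220201000000");
    List<String> inRange = instants.stream()
        .filter(ts -> ts.compareTo(startTs) >= 0) // GREATER_THAN_OR_EQUALS, startTs inclusive
        .collect(Collectors.toList());
    System.out.println(inRange); // [20220115000000, 20220201000000]
  }
}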
*/ @@ -330,7 +358,7 @@ public HoodieDefaultTimeline getWriteTimeline() { // filter in-memory instants Set validActions = CollectionUtils.createSet(COMMIT_ACTION, DELTA_COMMIT_ACTION, COMPACTION_ACTION, REPLACE_COMMIT_ACTION); return new HoodieDefaultTimeline(getInstants().filter(i -> - readCommits.keySet().contains(i.getTimestamp())) + readCommits.containsKey(i.getTimestamp())) .filter(s -> validActions.contains(s.getAction())), details); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstant.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstant.java index a8df62c6496ae..9cd0883126495 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstant.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstant.java @@ -166,6 +166,7 @@ public String getFileName() { } } else if (HoodieTimeline.RESTORE_ACTION.equals(action)) { return isInflight() ? HoodieTimeline.makeInflightRestoreFileName(timestamp) + : isRequested() ? HoodieTimeline.makeRequestedRestoreFileName(timestamp) : HoodieTimeline.makeRestoreFileName(timestamp); } else if (HoodieTimeline.REPLACE_COMMIT_ACTION.equals(action)) { return isInflight() ? HoodieTimeline.makeInflightReplaceFileName(timestamp) diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieTimeline.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieTimeline.java index 6ea44a83007d1..25b9c2ec6f2e4 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieTimeline.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieTimeline.java @@ -78,6 +78,7 @@ public interface HoodieTimeline extends Serializable { String REQUESTED_COMPACTION_SUFFIX = StringUtils.join(COMPACTION_ACTION, REQUESTED_EXTENSION); String REQUESTED_COMPACTION_EXTENSION = StringUtils.join(".", REQUESTED_COMPACTION_SUFFIX); String INFLIGHT_COMPACTION_EXTENSION = StringUtils.join(".", COMPACTION_ACTION, INFLIGHT_EXTENSION); + String REQUESTED_RESTORE_EXTENSION = "." + RESTORE_ACTION + REQUESTED_EXTENSION; String INFLIGHT_RESTORE_EXTENSION = "." + RESTORE_ACTION + INFLIGHT_EXTENSION; String RESTORE_EXTENSION = "." + RESTORE_ACTION; String INFLIGHT_REPLACE_COMMIT_EXTENSION = "." 
+ REPLACE_COMMIT_ACTION + INFLIGHT_EXTENSION; @@ -386,6 +387,10 @@ static String makeRequestedRollbackFileName(String instant) { return StringUtils.join(instant, HoodieTimeline.REQUESTED_ROLLBACK_EXTENSION); } + static String makeRequestedRestoreFileName(String instant) { + return StringUtils.join(instant, HoodieTimeline.REQUESTED_RESTORE_EXTENSION); + } + static String makeInflightRollbackFileName(String instant) { return StringUtils.join(instant, HoodieTimeline.INFLIGHT_ROLLBACK_EXTENSION); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineMetadataUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineMetadataUtils.java index 32e42ee58ac27..70a23f1b4c0fb 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineMetadataUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/TimelineMetadataUtils.java @@ -26,6 +26,7 @@ import org.apache.hudi.avro.model.HoodieReplaceCommitMetadata; import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata; import org.apache.hudi.avro.model.HoodieRestoreMetadata; +import org.apache.hudi.avro.model.HoodieRestorePlan; import org.apache.hudi.avro.model.HoodieRollbackMetadata; import org.apache.hudi.avro.model.HoodieRollbackPartitionMetadata; import org.apache.hudi.avro.model.HoodieRollbackPlan; @@ -77,10 +78,8 @@ public static HoodieRollbackMetadata convertRollbackMetadata(String startRollbac for (HoodieRollbackStat stat : rollbackStats) { Map rollbackLogFiles = stat.getCommandBlocksCount().keySet().stream() .collect(Collectors.toMap(f -> f.getPath().toString(), FileStatus::getLen)); - Map probableLogFiles = stat.getWrittenLogFileSizeMap().keySet().stream() - .collect(Collectors.toMap(f -> f.getPath().toString(), FileStatus::getLen)); HoodieRollbackPartitionMetadata metadata = new HoodieRollbackPartitionMetadata(stat.getPartitionPath(), - stat.getSuccessDeleteFiles(), stat.getFailedDeleteFiles(), rollbackLogFiles, probableLogFiles); + stat.getSuccessDeleteFiles(), stat.getFailedDeleteFiles(), rollbackLogFiles); partitionMetadataBuilder.put(stat.getPartitionPath(), metadata); totalDeleted += stat.getSuccessDeleteFiles().size(); } @@ -114,6 +113,10 @@ public static Option serializeRollbackPlan(HoodieRollbackPlan rollbackPl return serializeAvroMetadata(rollbackPlan, HoodieRollbackPlan.class); } + public static Option serializeRestorePlan(HoodieRestorePlan restorePlan) throws IOException { + return serializeAvroMetadata(restorePlan, HoodieRestorePlan.class); + } + public static Option serializeCleanMetadata(HoodieCleanMetadata metadata) throws IOException { return serializeAvroMetadata(metadata, HoodieCleanMetadata.class); } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java index 92e6171b68327..208d7ef2ba456 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java @@ -380,6 +380,19 @@ protected boolean isBaseFileDueToPendingCompaction(HoodieBaseFile baseFile) { && baseFile.getCommitTime().equals(compactionWithInstantTime.get().getKey()); } + /** + * With async clustering, it is possible to see partial/complete base-files due to inflight-clustering; ignore those + base-files.
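The guard introduced here (its @param and body follow) boils down to a membership check of the base file's commit time against the pending replacecommit instants; a stripped-down sketch:

import java.util.Arrays;
import java.util.List;

final class PendingClusteringGuard {
  // A base file is ignored when its commit time matches any pending replacecommit instant.
  static boolean isDueToPendingClustering(String baseFileCommitTime, List<String> pendingReplaceInstants) {
    return !pendingReplaceInstants.isEmpty() && pendingReplaceInstants.contains(baseFileCommitTime);
  }

  public static void main(String[] args) {
    List<String> pending = Arrays.asList("20220126103000");
    System.out.println(isDueToPendingClustering("20220126103000", pending)); // true -> filter out
    System.out.println(isDueToPendingClustering("20220126090000", pending)); // false -> keep
  }
}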
+ * + * @param baseFile base File + */ + protected boolean isBaseFileDueToPendingClustering(HoodieBaseFile baseFile) { + List pendingReplaceInstants = + metaClient.getActiveTimeline().filterPendingReplaceTimeline().getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList()); + + return !pendingReplaceInstants.isEmpty() && pendingReplaceInstants.contains(baseFile.getCommitTime()); + } + /** * Returns true if the file-group is under pending-compaction and the file-slice' baseInstant matches compaction * Instant. @@ -401,7 +414,7 @@ protected boolean isFileSliceAfterPendingCompaction(FileSlice fileSlice) { */ protected FileSlice filterBaseFileAfterPendingCompaction(FileSlice fileSlice) { if (isFileSliceAfterPendingCompaction(fileSlice)) { - LOG.info("File Slice (" + fileSlice + ") is in pending compaction"); + LOG.debug("File Slice (" + fileSlice + ") is in pending compaction"); // Base file is filtered out of the file-slice as the corresponding compaction // instant not completed yet. FileSlice transformed = @@ -492,7 +505,7 @@ public final Stream getLatestBaseFilesBeforeOrOn(String partitio .map(fileGroup -> Option.fromJavaOptional(fileGroup.getAllBaseFiles() .filter(baseFile -> HoodieTimeline.compareTimestamps(baseFile.getCommitTime(), HoodieTimeline.LESSER_THAN_OR_EQUALS, maxCommitTime )) - .filter(df -> !isBaseFileDueToPendingCompaction(df)).findFirst())) + .filter(df -> !isBaseFileDueToPendingCompaction(df) && !isBaseFileDueToPendingClustering(df)).findFirst())) .filter(Option::isPresent).map(Option::get) .map(df -> addBootstrapBaseFileIfPresent(new HoodieFileGroupId(partitionPath, df.getFileId()), df)); } finally { @@ -511,7 +524,7 @@ public final Option getBaseFileOn(String partitionStr, String in } else { return fetchHoodieFileGroup(partitionPath, fileId).map(fileGroup -> fileGroup.getAllBaseFiles() .filter(baseFile -> HoodieTimeline.compareTimestamps(baseFile.getCommitTime(), HoodieTimeline.EQUALS, - instantTime)).filter(df -> !isBaseFileDueToPendingCompaction(df)).findFirst().orElse(null)) + instantTime)).filter(df -> !isBaseFileDueToPendingCompaction(df) && !isBaseFileDueToPendingClustering(df)).findFirst().orElse(null)) .map(df -> addBootstrapBaseFileIfPresent(new HoodieFileGroupId(partitionPath, fileId), df)); } } finally { @@ -547,7 +560,7 @@ public final Stream getLatestBaseFilesInRange(List commi .filter(fileGroup -> !isFileGroupReplacedBeforeAny(fileGroup.getFileGroupId(), commitsToReturn)) .map(fileGroup -> Pair.of(fileGroup.getFileGroupId(), Option.fromJavaOptional( fileGroup.getAllBaseFiles().filter(baseFile -> commitsToReturn.contains(baseFile.getCommitTime()) - && !isBaseFileDueToPendingCompaction(baseFile)).findFirst()))).filter(p -> p.getValue().isPresent()) + && !isBaseFileDueToPendingCompaction(baseFile) && !isBaseFileDueToPendingClustering(baseFile)).findFirst()))).filter(p -> p.getValue().isPresent()) .map(p -> addBootstrapBaseFileIfPresent(p.getKey(), p.getValue().get())); } finally { readLock.unlock(); @@ -563,7 +576,7 @@ public final Stream getAllBaseFiles(String partitionStr) { return fetchAllBaseFiles(partitionPath) .filter(df -> !isFileGroupReplaced(partitionPath, df.getFileId())) .filter(df -> visibleCommitsAndCompactionTimeline.containsOrBeforeTimelineStarts(df.getCommitTime())) - .filter(df -> !isBaseFileDueToPendingCompaction(df)) + .filter(df -> !isBaseFileDueToPendingCompaction(df) && !isBaseFileDueToPendingClustering(df)) .map(df -> addBootstrapBaseFileIfPresent(new HoodieFileGroupId(partitionPath, df.getFileId()), df)); } finally { 
readLock.unlock(); @@ -953,7 +966,7 @@ public Stream fetchLatestBaseFiles(final String partitionPath) { protected Option getLatestBaseFile(HoodieFileGroup fileGroup) { return Option - .fromJavaOptional(fileGroup.getAllBaseFiles().filter(df -> !isBaseFileDueToPendingCompaction(df)).findFirst()); + .fromJavaOptional(fileGroup.getAllBaseFiles().filter(df -> !isBaseFileDueToPendingCompaction(df) && !isBaseFileDueToPendingClustering(df)).findFirst()); } /** diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewStorageConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewStorageConfig.java index b77b0d3a82521..e2342edc3a351 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewStorageConfig.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/FileSystemViewStorageConfig.java @@ -240,7 +240,7 @@ public Builder withMaxMemoryForView(Long maxMemoryForView) { return this; } - public Builder withRemoteTimelineClientTimeoutSecs(Long timelineClientTimeoutSecs) { + public Builder withRemoteTimelineClientTimeoutSecs(Integer timelineClientTimeoutSecs) { fileSystemViewStorageConfig.setValue(REMOTE_TIMEOUT_SECS, timelineClientTimeoutSecs.toString()); return this; } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/HoodieTablePreCommitFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/HoodieTablePreCommitFileSystemView.java index 76f7e3ca5e388..7401617a6abb6 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/view/HoodieTablePreCommitFileSystemView.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/view/HoodieTablePreCommitFileSystemView.java @@ -22,7 +22,6 @@ import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.util.Option; import java.util.Collections; import java.util.List; @@ -75,16 +74,12 @@ public final Stream getLatestBaseFiles(String partitionStr) { new HoodieBaseFile(new Path(tableMetaClient.getBasePath(), writeStat.getPath()).toString()))); Stream committedBaseFiles = this.completedCommitsFileSystemView.getLatestBaseFiles(partitionStr); - Stream baseFilesForCommittedFileIds = committedBaseFiles - // Remove files replaced by current inflight commit - .filter(baseFile -> !replacedFileIdsForPartition.contains(baseFile.getFileId())) - // if there is new version of file created by inflight commit, use that file instead of committed version - .map(baseFile -> { - HoodieBaseFile fileIdNewVersionExists = newFilesWrittenForPartition.remove(baseFile.getFileId()); - return Option.ofNullable(fileIdNewVersionExists).orElse(baseFile); - }); - - Stream baseFilesWithNewFileIds = newFilesWrittenForPartition.values().stream(); - return Stream.concat(baseFilesForCommittedFileIds, baseFilesWithNewFileIds); + Map allFileIds = committedBaseFiles + // Remove files replaced by current inflight commit + .filter(baseFile -> !replacedFileIdsForPartition.contains(baseFile.getFileId())) + .collect(Collectors.toMap(HoodieBaseFile::getFileId, baseFile -> baseFile)); + + allFileIds.putAll(newFilesWrittenForPartition); + return allFileIds.values().stream(); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/BaseFileUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/BaseFileUtils.java index 25a2bec5baff2..7ec6110d723ab 100644 --- 
a/hudi-common/src/main/java/org/apache/hudi/common/util/BaseFileUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/BaseFileUtils.java @@ -167,18 +167,35 @@ public abstract Map readFooter(Configuration configuration, bool * Fetch {@link HoodieKey}s from the given data file. * @param configuration configuration to build fs object * @param filePath The data file path - * @return {@link List} of {@link HoodieKey}s fetched from the parquet file + * @return {@link List} of {@link HoodieKey}s fetched from the data file */ - public abstract List fetchRecordKeyPartitionPath(Configuration configuration, Path filePath); + public abstract List fetchHoodieKeys(Configuration configuration, Path filePath); + + /** + * Provides a closable iterator for reading the given data file. + * @param configuration configuration to build fs object + * @param filePath The data file path + * @param keyGeneratorOpt instance of KeyGenerator. + * @return {@link ClosableIterator} of {@link HoodieKey}s for reading the file + */ + public abstract ClosableIterator getHoodieKeyIterator(Configuration configuration, Path filePath, Option keyGeneratorOpt); + + /** + * Provides a closable iterator for reading the given data file. + * @param configuration configuration to build fs object + * @param filePath The data file path + * @return {@link ClosableIterator} of {@link HoodieKey}s for reading the file + */ + public abstract ClosableIterator getHoodieKeyIterator(Configuration configuration, Path filePath); /** * Fetch {@link HoodieKey}s from the given data file. * @param configuration configuration to build fs object * @param filePath The data file path * @param keyGeneratorOpt instance of KeyGenerator. - * @return {@link List} of {@link HoodieKey}s fetched from the parquet file + * @return {@link List} of {@link HoodieKey}s fetched from the data file */ - public abstract List fetchRecordKeyPartitionPath(Configuration configuration, Path filePath, Option keyGeneratorOpt); + public abstract List fetchHoodieKeys(Configuration configuration, Path filePath, Option keyGeneratorOpt); /** * Read the Avro schema of the data file. diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/BinaryUtil.java b/hudi-common/src/main/java/org/apache/hudi/common/util/BinaryUtil.java index 0c7e898957670..9fec2c8cf5924 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/BinaryUtil.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/BinaryUtil.java @@ -19,6 +19,7 @@ package org.apache.hudi.common.util; import java.nio.charset.Charset; +import java.util.zip.CRC32; public class BinaryUtil { @@ -187,5 +188,14 @@ public static long convertBytesToLong(byte[] bytes) { } return temp; } + + /** + * Generate a checksum for a given set of bytes. 
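The fetchRecordKeyPartitionPath-to-fetchHoodieKeys rename comes with iterator-returning variants so callers can stream keys instead of buffering whole files. A hedged usage sketch, with a stand-in interface mirroring Hudi's ClosableIterator (Iterator plus AutoCloseable with an unchecked close):

import java.util.Iterator;

// Minimal stand-in for org.apache.hudi.common.util.ClosableIterator.
interface KeyIterator<T> extends Iterator<T>, AutoCloseable {
  @Override
  void close(); // unchecked, so it composes with try-with-resources without a catch block
}

final class KeyScan {
  static <T> long countKeys(KeyIterator<T> keys) {
    try (KeyIterator<T> itr = keys) { // closes the underlying file reader on exit
      long count = 0;
      while (itr.hasNext()) {
        itr.next();
        count++;
      }
      return count;
    }
  }
}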
+ */ + public static long generateChecksum(byte[] data) { + CRC32 crc = new CRC32(); + crc.update(data); + return crc.getValue(); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/CleanerUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/CleanerUtils.java index 9f63bfa3da4c4..a3a1305667f6a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/CleanerUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/CleanerUtils.java @@ -35,6 +35,9 @@ import org.apache.hudi.common.table.timeline.versioning.clean.CleanMetadataV2MigrationHandler; import org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanMigrator; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + import java.io.IOException; import java.util.HashMap; import java.util.List; @@ -43,6 +46,9 @@ import static org.apache.hudi.common.table.timeline.HoodieTimeline.COMMIT_ACTION; public class CleanerUtils { + + private static final Logger LOG = LogManager.getLogger(CleanerUtils.class); + public static final Integer CLEAN_METADATA_VERSION_1 = CleanMetadataV1MigrationHandler.VERSION; public static final Integer CLEAN_METADATA_VERSION_2 = CleanMetadataV2MigrationHandler.VERSION; public static final Integer LATEST_CLEAN_METADATA_VERSION = CLEAN_METADATA_VERSION_2; @@ -131,6 +137,7 @@ public static void rollbackFailedWrites(HoodieFailedWritesCleaningPolicy cleanin // No need to do any special cleanup for failed operations during clean return; } else if (cleaningPolicy.isLazy()) { + LOG.info("Cleaned failed attempts if any"); // Perform rollback of failed operations for all types of actions during clean rollbackFailedWritesFunc.apply(); return; @@ -140,6 +147,7 @@ public static void rollbackFailedWrites(HoodieFailedWritesCleaningPolicy cleanin case COMMIT_ACTION: // For any other actions, perform rollback of failed writes if (cleaningPolicy.isEager()) { + LOG.info("Cleaned failed attempts if any"); rollbackFailedWritesFunc.apply(); return; } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ClusteringUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ClusteringUtils.java index 15e53705b0d0c..9d741a03f82ec 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ClusteringUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ClusteringUtils.java @@ -215,6 +215,12 @@ private static Map buildMetrics(List fileSlices) { } public static List getPendingClusteringInstantTimes(HoodieTableMetaClient metaClient) { - return metaClient.getActiveTimeline().filterPendingReplaceTimeline().getInstants().collect(Collectors.toList()); + return metaClient.getActiveTimeline().filterPendingReplaceTimeline().getInstants() + .filter(instant -> isPendingClusteringInstant(metaClient, instant)) + .collect(Collectors.toList()); + } + + public static boolean isPendingClusteringInstant(HoodieTableMetaClient metaClient, HoodieInstant instant) { + return getClusteringPlan(metaClient, instant).isPresent(); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/CollectionUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/CollectionUtils.java index 6a4efca295efe..1a3d053e23acd 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/CollectionUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/CollectionUtils.java @@ -62,7 +62,7 @@ public static T[] append(T[] array, T elem) { /** - * Combines provided {@link List}s into one + * Combines provided {@link List}s into 
one, returning new instance of {@link ArrayList} */ public static List combine(List one, List another) { ArrayList combined = new ArrayList<>(one.size() + another.size()); @@ -71,6 +71,19 @@ public static List combine(List one, List another) { return combined; } + /** + * Combines provided {@link Map}s into one, returning new instance of {@link HashMap}. + * + * NOTE: Values associated with overlapping keys from the second map will override + values from the first one + */ + public static Map combine(Map one, Map another) { + Map combined = new HashMap<>(one.size() + another.size()); + combined.putAll(one); + combined.putAll(another); + return combined; + } + /** * Returns difference b/w {@code one} {@link Set} of elements and {@code another} */ diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/Functions.java b/hudi-common/src/main/java/org/apache/hudi/common/util/Functions.java index 3ec96be207330..0b82f091402a0 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/Functions.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/Functions.java @@ -25,6 +25,11 @@ */ public interface Functions { + static Runnable noop() { + return () -> { + }; + } + /** * A function which has not any parameter. */ diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/Option.java b/hudi-common/src/main/java/org/apache/hudi/common/util/Option.java index 42d6057968f97..193bf5315fd01 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/Option.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/Option.java @@ -108,14 +108,31 @@ public Option map(Function mapper) { } } + /** + * Returns this {@link Option} if not empty, otherwise evaluates the provided supplier + * and returns the alternative + */ + public Option or(Supplier> other) { + return val != null ? this : other.get(); + } + + /** + * Identical to {@code Optional.orElse} + */ public T orElse(T other) { return val != null ? val : other; } + /** + * Identical to {@code Optional.orElseGet} + */ public T orElseGet(Supplier other) { return val != null ? val : other.get(); } + /** + * Identical to {@code Optional.orElseThrow} + */ public T orElseThrow(Supplier exceptionSupplier) throws X { if (val != null) { return val; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/OrcReaderIterator.java b/hudi-common/src/main/java/org/apache/hudi/common/util/OrcReaderIterator.java index 4b3caa756a65f..d9ceeeee40f63 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/OrcReaderIterator.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/OrcReaderIterator.java @@ -29,19 +29,18 @@ import org.apache.orc.TypeDescription; import java.io.IOException; -import java.util.Iterator; /** * This class wraps an ORC reader and provides an iterator based api to read from an ORC file.
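The combine(Map, Map) helper added above has last-write-wins semantics because putAll(another) runs second; a self-contained demo:

import java.util.HashMap;
import java.util.Map;

final class CombineDemo {
  static <K, V> Map<K, V> combine(Map<K, V> one, Map<K, V> another) {
    Map<K, V> combined = new HashMap<>(one.size() + another.size());
    combined.putAll(one);
    combined.putAll(another); // overlapping keys take the second map's value
    return combined;
  }

  public static void main(String[] args) {
    Map<String, Integer> a = new HashMap<>();
    a.put("x", 1);
    Map<String, Integer> b = new HashMap<>();
    b.put("x", 2);
    System.out.println(combine(a, b)); // {x=2}
  }
}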
*/ -public class OrcReaderIterator implements Iterator { +public class OrcReaderIterator implements ClosableIterator { private final RecordReader recordReader; private final Schema avroSchema; - List fieldNames; - List orcFieldTypes; - Schema[] avroFieldSchemas; - private VectorizedRowBatch batch; + private final List fieldNames; + private final List orcFieldTypes; + private final Schema[] avroFieldSchemas; + private final VectorizedRowBatch batch; private int rowInBatch; private T next; @@ -52,7 +51,7 @@ public OrcReaderIterator(RecordReader recordReader, Schema schema, TypeDescripti this.orcFieldTypes = orcSchema.getChildren(); this.avroFieldSchemas = fieldNames.stream() .map(fieldName -> avroSchema.getField(fieldName).schema()) - .toArray(size -> new Schema[size]); + .toArray(Schema[]::new); this.batch = orcSchema.createRowBatch(); this.rowInBatch = 0; } @@ -115,4 +114,9 @@ private GenericData.Record readRecordFromBatch() throws IOException { rowInBatch++; return record; } + + @Override + public void close() { + FileIOUtils.closeQuietly(this.recordReader); + } } \ No newline at end of file diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/OrcUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/OrcUtils.java index e418043fe0ecd..88c28d75204a7 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/OrcUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/OrcUtils.java @@ -19,7 +19,9 @@ package org.apache.hudi.common.util; import java.io.IOException; +import java.nio.ByteBuffer; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -54,29 +56,23 @@ public class OrcUtils extends BaseFileUtils { /** - * Fetch {@link HoodieKey}s from the given ORC file. + * Provides a closable iterator for reading the given ORC file. * - * @param filePath The ORC file path. 
* @param configuration configuration to build fs object - * @return {@link List} of {@link HoodieKey}s fetched from the ORC file + * @param filePath The ORC file path + * @return {@link ClosableIterator} of {@link HoodieKey}s for reading the ORC file */ @Override - public List fetchRecordKeyPartitionPath(Configuration configuration, Path filePath) { - List hoodieKeys = new ArrayList<>(); + public ClosableIterator getHoodieKeyIterator(Configuration configuration, Path filePath) { try { - if (!filePath.getFileSystem(configuration).exists(filePath)) { - return new ArrayList<>(); - } - Configuration conf = new Configuration(configuration); conf.addResource(FSUtils.getFs(filePath.toString(), conf).getConf()); Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf)); Schema readSchema = HoodieAvroUtils.getRecordKeyPartitionPathSchema(); TypeDescription orcSchema = AvroOrcUtils.createOrcSchema(readSchema); - List fieldNames = orcSchema.getFieldNames(); - VectorizedRowBatch batch = orcSchema.createRowBatch(); RecordReader recordReader = reader.rows(new Options(conf).schema(orcSchema)); + List fieldNames = orcSchema.getFieldNames(); // column indices for the RECORD_KEY_METADATA_FIELD, PARTITION_PATH_METADATA_FIELD fields int keyCol = -1; @@ -92,24 +88,43 @@ public List fetchRecordKeyPartitionPath(Configuration configuration, if (keyCol == -1 || partitionCol == -1) { throw new HoodieException(String.format("Couldn't find row keys or partition path in %s.", filePath)); } - while (recordReader.nextBatch(batch)) { - BytesColumnVector rowKeys = (BytesColumnVector) batch.cols[keyCol]; - BytesColumnVector partitionPaths = (BytesColumnVector) batch.cols[partitionCol]; - for (int i = 0; i < batch.size; i++) { - String rowKey = rowKeys.toString(i); - String partitionPath = partitionPaths.toString(i); - hoodieKeys.add(new HoodieKey(rowKey, partitionPath)); - } + return new OrcReaderIterator<>(recordReader, readSchema, orcSchema); + } catch (IOException e) { + throw new HoodieIOException("Failed to open reader from ORC file:" + filePath, e); + } + } + + /** + * Fetch {@link HoodieKey}s from the given ORC file. + * + * @param filePath The ORC file path. 
+ * @param configuration configuration to build fs object + * @return {@link List} of {@link HoodieKey}s fetched from the ORC file + */ + @Override + public List fetchHoodieKeys(Configuration configuration, Path filePath) { + try { + if (!filePath.getFileSystem(configuration).exists(filePath)) { + return Collections.emptyList(); } } catch (IOException e) { throw new HoodieIOException("Failed to read from ORC file:" + filePath, e); } + List hoodieKeys = new ArrayList<>(); + try (ClosableIterator iterator = getHoodieKeyIterator(configuration, filePath, Option.empty())) { + iterator.forEachRemaining(hoodieKeys::add); + } return hoodieKeys; } @Override - public List fetchRecordKeyPartitionPath(Configuration configuration, Path filePath, Option keyGeneratorOpt) { - throw new HoodieIOException("UnsupportedOperation : Disabling meta fields not yet supported for Orc"); + public List fetchHoodieKeys(Configuration configuration, Path filePath, Option keyGeneratorOpt) { + throw new UnsupportedOperationException("Custom key generator is not supported yet"); + } + + @Override + public ClosableIterator getHoodieKeyIterator(Configuration configuration, Path filePath, Option keyGeneratorOpt) { + throw new UnsupportedOperationException("Custom key generator is not supported yet"); } /** @@ -118,8 +133,7 @@ public List fetchRecordKeyPartitionPath(Configuration configuration, @Override public List readAvroRecords(Configuration configuration, Path filePath) { Schema avroSchema; - try { - Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(configuration)); + try (Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(configuration))) { avroSchema = AvroOrcUtils.createAvroSchema(reader.getSchema()); } catch (IOException io) { throw new HoodieIOException("Unable to read Avro records from an ORC file:" + filePath, io); @@ -133,14 +147,14 @@ public List readAvroRecords(Configuration configuration, Path fil @Override public List readAvroRecords(Configuration configuration, Path filePath, Schema avroSchema) { List records = new ArrayList<>(); - try { - Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(configuration)); + try (Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(configuration))) { TypeDescription orcSchema = reader.getSchema(); - RecordReader recordReader = reader.rows(new Options(configuration).schema(orcSchema)); - OrcReaderIterator iterator = new OrcReaderIterator<>(recordReader, avroSchema, orcSchema); - while (iterator.hasNext()) { - GenericRecord record = iterator.next(); - records.add(record); + try (RecordReader recordReader = reader.rows(new Options(configuration).schema(orcSchema))) { + OrcReaderIterator iterator = new OrcReaderIterator<>(recordReader, avroSchema, orcSchema); + while (iterator.hasNext()) { + GenericRecord record = iterator.next(); + records.add(record); + } } } catch (IOException io) { throw new HoodieIOException("Unable to create an ORC reader for ORC file:" + filePath, io); @@ -160,35 +174,35 @@ public List readAvroRecords(Configuration configuration, Path fil @Override public Set filterRowKeys(Configuration conf, Path filePath, Set filter) throws HoodieIOException { - try { - Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf)); - Set filteredRowKeys = new HashSet<>(); + try (Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf));) { TypeDescription schema = reader.getSchema(); - List fieldNames = schema.getFieldNames(); - VectorizedRowBatch batch = 
schema.createRowBatch(); - RecordReader recordReader = reader.rows(new Options(conf).schema(schema)); + try (RecordReader recordReader = reader.rows(new Options(conf).schema(schema))) { + Set filteredRowKeys = new HashSet<>(); + List fieldNames = schema.getFieldNames(); + VectorizedRowBatch batch = schema.createRowBatch(); - // column index for the RECORD_KEY_METADATA_FIELD field - int colIndex = -1; - for (int i = 0; i < fieldNames.size(); i++) { - if (fieldNames.get(i).equals(HoodieRecord.RECORD_KEY_METADATA_FIELD)) { - colIndex = i; - break; + // column index for the RECORD_KEY_METADATA_FIELD field + int colIndex = -1; + for (int i = 0; i < fieldNames.size(); i++) { + if (fieldNames.get(i).equals(HoodieRecord.RECORD_KEY_METADATA_FIELD)) { + colIndex = i; + break; + } } - } - if (colIndex == -1) { - throw new HoodieException(String.format("Couldn't find row keys in %s.", filePath)); - } - while (recordReader.nextBatch(batch)) { - BytesColumnVector rowKeys = (BytesColumnVector) batch.cols[colIndex]; - for (int i = 0; i < batch.size; i++) { - String rowKey = rowKeys.toString(i); - if (filter.isEmpty() || filter.contains(rowKey)) { - filteredRowKeys.add(rowKey); + if (colIndex == -1) { + throw new HoodieException(String.format("Couldn't find row keys in %s.", filePath)); + } + while (recordReader.nextBatch(batch)) { + BytesColumnVector rowKeys = (BytesColumnVector) batch.cols[colIndex]; + for (int i = 0; i < batch.size; i++) { + String rowKey = rowKeys.toString(i); + if (filter.isEmpty() || filter.contains(rowKey)) { + filteredRowKeys.add(rowKey); + } } } + return filteredRowKeys; } - return filteredRowKeys; } catch (IOException io) { throw new HoodieIOException("Unable to read row keys for ORC file:" + filePath, io); } @@ -197,8 +211,7 @@ public Set filterRowKeys(Configuration conf, Path filePath, Set @Override public Map readFooter(Configuration conf, boolean required, Path orcFilePath, String... 
footerNames) { - try { - Reader reader = OrcFile.createReader(orcFilePath, OrcFile.readerOptions(conf)); + try (Reader reader = OrcFile.createReader(orcFilePath, OrcFile.readerOptions(conf))) { Map footerVals = new HashMap<>(); List metadataItemList = reader.getFileTail().getFooter().getMetadataList(); Map metadata = metadataItemList.stream().collect(Collectors.toMap( @@ -220,10 +233,16 @@ public Map readFooter(Configuration conf, boolean required, @Override public Schema readAvroSchema(Configuration conf, Path orcFilePath) { - try { - Reader reader = OrcFile.createReader(orcFilePath, OrcFile.readerOptions(conf)); - TypeDescription orcSchema = reader.getSchema(); - return AvroOrcUtils.createAvroSchema(orcSchema); + try (Reader reader = OrcFile.createReader(orcFilePath, OrcFile.readerOptions(conf))) { + if (reader.hasMetadataValue("orc.avro.schema")) { + ByteBuffer metadataValue = reader.getMetadataValue("orc.avro.schema"); + byte[] bytes = new byte[metadataValue.remaining()]; + metadataValue.get(bytes); + return new Schema.Parser().parse(new String(bytes)); + } else { + TypeDescription orcSchema = reader.getSchema(); + return AvroOrcUtils.createAvroSchema(orcSchema); + } } catch (IOException io) { throw new HoodieIOException("Unable to get Avro schema for ORC file:" + orcFilePath, io); } @@ -231,8 +250,7 @@ public Schema readAvroSchema(Configuration conf, Path orcFilePath) { @Override public long getRowCount(Configuration conf, Path orcFilePath) { - try { - Reader reader = OrcFile.createReader(orcFilePath, OrcFile.readerOptions(conf)); + try (Reader reader = OrcFile.createReader(orcFilePath, OrcFile.readerOptions(conf))) { return reader.getNumberOfRows(); } catch (IOException io) { throw new HoodieIOException("Unable to get row count for ORC file:" + orcFilePath, io); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetReaderIterator.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetReaderIterator.java index 5970e02d6799a..03bd471b606f1 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetReaderIterator.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetReaderIterator.java @@ -24,13 +24,12 @@ import org.apache.parquet.hadoop.ParquetReader; import java.io.IOException; -import java.util.Iterator; /** * This class wraps a parquet reader and provides an iterator based api to read from a parquet file. 
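Among the try-with-resources conversions above, readAvroSchema also changes behavior: it now prefers the exact writer schema stored under the ORC file's "orc.avro.schema" metadata key and only falls back to converting the ORC type description. A hedged sketch of that resolution order, with a plain Map standing in for the file's key/value metadata:

import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.Optional;

final class OrcSchemaResolution {
  static String resolveAvroSchemaJson(Map<String, byte[]> fileMetadata, String convertedFromOrcSchema) {
    return Optional.ofNullable(fileMetadata.get("orc.avro.schema"))
        .map(bytes -> new String(bytes, StandardCharsets.UTF_8)) // lossless: exact writer schema
        .orElse(convertedFromOrcSchema); // lossy: reconstructed from ORC types
  }
}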
This is used in * {@link BoundedInMemoryQueue} */ -public class ParquetReaderIterator implements Iterator { +public class ParquetReaderIterator implements ClosableIterator { // Parquet reader for an existing parquet file private final ParquetReader parquetReader; @@ -73,7 +72,11 @@ public T next() { } } - public void close() throws IOException { - parquetReader.close(); + public void close() { + try { + parquetReader.close(); + } catch (IOException e) { + throw new HoodieException("Exception while closing the parquet reader", e); + } } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java index 136206150cbb7..e74f4f77703d0 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java @@ -18,10 +18,6 @@ package org.apache.hudi.common.util; -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieColumnRangeMetadata; @@ -30,6 +26,13 @@ import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.MetadataNotFoundException; import org.apache.hudi.keygen.BaseKeyGenerator; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; import org.apache.parquet.avro.AvroParquetReader; import org.apache.parquet.avro.AvroReadSupport; import org.apache.parquet.avro.AvroSchemaConverter; @@ -37,14 +40,17 @@ import org.apache.parquet.hadoop.ParquetReader; import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.io.api.Binary; import org.apache.parquet.schema.DecimalMetadata; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.PrimitiveType; import javax.annotation.Nonnull; + import java.io.IOException; import java.math.BigDecimal; +import java.math.BigInteger; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; @@ -60,6 +66,8 @@ */ public class ParquetUtils extends BaseFileUtils { + private static final Logger LOG = LogManager.getLogger(ParquetUtils.class); + /** * Read the rowKey list matching the given filter, from the given parquet file. If the filter is empty, then this will * return all the rowkeys. @@ -74,6 +82,17 @@ public Set filterRowKeys(Configuration configuration, Path filePath, Set return filterParquetRowKeys(configuration, filePath, filter, HoodieAvroUtils.getRecordKeySchema()); } + public static ParquetMetadata readMetadata(Configuration conf, Path parquetFilePath) { + ParquetMetadata footer; + try { + // TODO(vc): Should we use the parallel reading version here? + footer = ParquetFileReader.readFooter(FSUtils.getFs(parquetFilePath.toString(), conf).getConf(), parquetFilePath); + } catch (IOException e) { + throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath, e); + } + return footer; + } + /** * Read the rowKey list matching the given filter, from the given parquet file. If the filter is empty, then this will * return all the rowkeys. 
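ParquetReaderIterator's close() no longer throws a checked IOException, which is what lets it satisfy the ClosableIterator contract and be used in try-with-resources without a catch block. The adaptation pattern, sketched JDK-only (the patch wraps in HoodieException rather than UncheckedIOException):

import java.io.Closeable;
import java.io.IOException;
import java.io.UncheckedIOException;

final class QuietClose {
  static void closeUnchecked(Closeable reader) {
    try {
      reader.close();
    } catch (IOException e) {
      throw new UncheckedIOException("Exception while closing the reader", e);
    }
  }
}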
@@ -122,12 +141,26 @@ private static Set filterParquetRowKeys(Configuration configuration, Pat * @return {@link List} of {@link HoodieKey}s fetched from the parquet file */ @Override - public List fetchRecordKeyPartitionPath(Configuration configuration, Path filePath) { - return fetchRecordKeyPartitionPathInternal(configuration, filePath, Option.empty()); + public List fetchHoodieKeys(Configuration configuration, Path filePath) { + return fetchHoodieKeys(configuration, filePath, Option.empty()); } - private List fetchRecordKeyPartitionPathInternal(Configuration configuration, Path filePath, Option keyGeneratorOpt) { - List hoodieKeys = new ArrayList<>(); + @Override + public ClosableIterator getHoodieKeyIterator(Configuration configuration, Path filePath) { + return getHoodieKeyIterator(configuration, filePath, Option.empty()); + } + + /** + * Returns a closable iterator for reading the given parquet file. + * + * @param configuration configuration to build fs object + * @param filePath The parquet file path + * @param keyGeneratorOpt instance of KeyGenerator + * + * @return {@link ClosableIterator} of {@link HoodieKey}s for reading the parquet file + */ + @Override + public ClosableIterator getHoodieKeyIterator(Configuration configuration, Path filePath, Option keyGeneratorOpt) { try { Configuration conf = new Configuration(configuration); conf.addResource(FSUtils.getFs(filePath.toString(), conf).getConf()); @@ -140,27 +173,11 @@ private List fetchRecordKeyPartitionPathInternal(Configuration config .orElse(HoodieAvroUtils.getRecordKeyPartitionPathSchema()); AvroReadSupport.setAvroReadSchema(conf, readSchema); AvroReadSupport.setRequestedProjection(conf, readSchema); - ParquetReader reader = AvroParquetReader.builder(filePath).withConf(conf).build(); - Object obj = reader.read(); - while (obj != null) { - if (obj instanceof GenericRecord) { - String recordKey = null; - String partitionPath = null; - if (keyGeneratorOpt.isPresent()) { - recordKey = keyGeneratorOpt.get().getRecordKey((GenericRecord) obj); - partitionPath = keyGeneratorOpt.get().getPartitionPath((GenericRecord) obj); - } else { - recordKey = ((GenericRecord) obj).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); - partitionPath = ((GenericRecord) obj).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); - } - hoodieKeys.add(new HoodieKey(recordKey, partitionPath)); - obj = reader.read(); - } - } + ParquetReader reader = AvroParquetReader.builder(filePath).withConf(conf).build(); + return HoodieKeyIterator.getInstance(new ParquetReaderIterator<>(reader), keyGeneratorOpt); } catch (IOException e) { throw new HoodieIOException("Failed to read from Parquet file " + filePath, e); } - return hoodieKeys; } /** @@ -168,23 +185,16 @@ private List fetchRecordKeyPartitionPathInternal(Configuration config * * @param configuration configuration to build fs object * @param filePath The parquet file path. - * @param keyGeneratorOpt + * @param keyGeneratorOpt instance of KeyGenerator. * @return {@link List} of {@link HoodieKey}s fetched from the parquet file */ @Override - public List fetchRecordKeyPartitionPath(Configuration configuration, Path filePath, Option keyGeneratorOpt) { - return fetchRecordKeyPartitionPathInternal(configuration, filePath, keyGeneratorOpt); - } - - public ParquetMetadata readMetadata(Configuration conf, Path parquetFilePath) { - ParquetMetadata footer; - try { - // TODO(vc): Should we use the parallel reading version here? 
- footer = ParquetFileReader.readFooter(FSUtils.getFs(parquetFilePath.toString(), conf).getConf(), parquetFilePath); - } catch (IOException e) { - throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath, e); + public List fetchHoodieKeys(Configuration configuration, Path filePath, Option keyGeneratorOpt) { + List hoodieKeys = new ArrayList<>(); + try (ClosableIterator iterator = getHoodieKeyIterator(configuration, filePath, keyGeneratorOpt)) { + iterator.forEachRemaining(hoodieKeys::add); + return hoodieKeys; } - return footer; } /** @@ -222,10 +232,8 @@ public Schema readAvroSchema(Configuration configuration, Path parquetFilePath) */ @Override public List readAvroRecords(Configuration configuration, Path filePath) { - ParquetReader reader = null; List records = new ArrayList<>(); - try { - reader = AvroParquetReader.builder(filePath).withConf(configuration).build(); + try (ParquetReader reader = AvroParquetReader.builder(filePath).withConf(configuration).build()) { Object obj = reader.read(); while (obj != null) { if (obj instanceof GenericRecord) { @@ -236,14 +244,6 @@ public List readAvroRecords(Configuration configuration, Path fil } catch (IOException e) { throw new HoodieIOException("Failed to read avro records from Parquet " + filePath, e); - } finally { - if (reader != null) { - try { - reader.close(); - } catch (IOException e) { - // ignore - } - } } return records; } @@ -298,18 +298,20 @@ public List> readRangeFromParquetMetadata( Map>> columnToStatsListMap = metadata.getBlocks().stream().sequential() .flatMap(blockMetaData -> blockMetaData.getColumns().stream() .filter(f -> cols.contains(f.getPath().toDotString())) - .map(columnChunkMetaData -> - new HoodieColumnRangeMetadata( - parquetFilePath.getName(), - columnChunkMetaData.getPath().toDotString(), - convertToNativeJavaType( - columnChunkMetaData.getPrimitiveType(), - columnChunkMetaData.getStatistics().genericGetMin()), - convertToNativeJavaType( - columnChunkMetaData.getPrimitiveType(), - columnChunkMetaData.getStatistics().genericGetMax()), - columnChunkMetaData.getStatistics().getNumNulls(), - columnChunkMetaData.getPrimitiveType().stringifier())) + .map(columnChunkMetaData -> + new HoodieColumnRangeMetadata( + parquetFilePath.getName(), + columnChunkMetaData.getPath().toDotString(), + convertToNativeJavaType( + columnChunkMetaData.getPrimitiveType(), + columnChunkMetaData.getStatistics().genericGetMin()), + convertToNativeJavaType( + columnChunkMetaData.getPrimitiveType(), + columnChunkMetaData.getStatistics().genericGetMax()), + columnChunkMetaData.getStatistics().getNumNulls(), + columnChunkMetaData.getValueCount(), + columnChunkMetaData.getTotalSize(), + columnChunkMetaData.getTotalUncompressedSize())) ).collect(Collectors.groupingBy(HoodieColumnRangeMetadata::getColumnName)); // Combine those into file-level statistics @@ -354,30 +356,117 @@ private > HoodieColumnRangeMetadata combineRanges( maxValue = one.getMaxValue().compareTo(another.getMaxValue()) < 0 ? 
another.getMaxValue() : one.getMaxValue(); } else if (one.getMaxValue() == null) { maxValue = another.getMaxValue(); - } else { + } else { maxValue = one.getMaxValue(); } return new HoodieColumnRangeMetadata( one.getFilePath(), - one.getColumnName(), minValue, maxValue, one.getNumNulls() + another.getNumNulls(), one.getStringifier()); + one.getColumnName(), minValue, maxValue, + one.getNullCount() + another.getNullCount(), + one.getValueCount() + another.getValueCount(), + one.getTotalSize() + another.getTotalSize(), + one.getTotalUncompressedSize() + another.getTotalUncompressedSize()); } private static Comparable convertToNativeJavaType(PrimitiveType primitiveType, Comparable val) { if (primitiveType.getOriginalType() == OriginalType.DECIMAL) { - DecimalMetadata decimalMetadata = primitiveType.getDecimalMetadata(); - return BigDecimal.valueOf((Integer) val, decimalMetadata.getScale()); + return extractDecimal(val, primitiveType.getDecimalMetadata()); } else if (primitiveType.getOriginalType() == OriginalType.DATE) { // NOTE: This is a workaround to address race-condition in using // {@code SimpleDataFormat} concurrently (w/in {@code DateStringifier}) // TODO cleanup after Parquet upgrade to 1.12 synchronized (primitiveType.stringifier()) { + // Date logical type is implemented as a signed INT32 + // REF: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md return java.sql.Date.valueOf( primitiveType.stringifier().stringify((Integer) val) ); } + } else if (primitiveType.getOriginalType() == OriginalType.UTF8) { + // NOTE: UTF8 type designates a byte array that should be interpreted as a + // UTF-8 encoded character string + // REF: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md + return ((Binary) val).toStringUsingUTF8(); + } else if (primitiveType.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.BINARY) { + // NOTE: `getBytes` access makes a copy of the underlying byte buffer + return ((Binary) val).toByteBuffer(); } return val; } + + @Nonnull + private static BigDecimal extractDecimal(Object val, DecimalMetadata decimalMetadata) { + // In Parquet, Decimal could be represented as either of + // 1. INT32 (for 1 <= precision <= 9) + // 2. INT64 (for 1 <= precision <= 18) + // 3. FIXED_LEN_BYTE_ARRAY (precision is limited by the array size. Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits) + // 4. BINARY (precision is not limited) + // REF: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#DECIMAL + int scale = decimalMetadata.getScale(); + if (val == null) { + return null; + } else if (val instanceof Integer) { + return BigDecimal.valueOf((Integer) val, scale); + } else if (val instanceof Long) { + return BigDecimal.valueOf((Long) val, scale); + } else if (val instanceof Binary) { + // NOTE: Unscaled number is stored in BE format (most significant byte is 0th) + return new BigDecimal(new BigInteger(((Binary) val).getBytesUnsafe()), scale); + } else { + throw new UnsupportedOperationException(String.format("Unsupported value type (%s)", val.getClass().getName())); + } + } + + // ------------------------------------------------------------------------- + // Inner Class + // ------------------------------------------------------------------------- + + /** + * An iterator that can apply the given function {@code func} to transform records + * from the underneath record iterator to hoodie keys. 
+ */ + private static class HoodieKeyIterator implements ClosableIterator { + private final ClosableIterator nestedItr; + private final Function func; + + public static HoodieKeyIterator getInstance(ClosableIterator nestedItr, Option keyGenerator) { + return new HoodieKeyIterator(nestedItr, keyGenerator); + } + + private HoodieKeyIterator(ClosableIterator nestedItr, Option keyGenerator) { + this.nestedItr = nestedItr; + if (keyGenerator.isPresent()) { + this.func = retVal -> { + String recordKey = keyGenerator.get().getRecordKey(retVal); + String partitionPath = keyGenerator.get().getPartitionPath(retVal); + return new HoodieKey(recordKey, partitionPath); + }; + } else { + this.func = retVal -> { + String recordKey = retVal.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + String partitionPath = retVal.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); + return new HoodieKey(recordKey, partitionPath); + }; + } + } + + @Override + public void close() { + if (this.nestedItr != null) { + this.nestedItr.close(); + } + } + + @Override + public boolean hasNext() { + return this.nestedItr.hasNext(); + } + + @Override + public HoodieKey next() { + return this.func.apply(this.nestedItr.next()); + } + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/RetryHelper.java b/hudi-common/src/main/java/org/apache/hudi/common/util/RetryHelper.java new file mode 100644 index 0000000000000..067c5ee40dad7 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/RetryHelper.java @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.common.util; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Random; +import java.util.stream.Collectors; + +public class RetryHelper { + private static final Logger LOG = LogManager.getLogger(RetryHelper.class); + private CheckedFunction func; + private int num; + private long maxIntervalTime; + private long initialIntervalTime = 100L; + private String taskInfo = "N/A"; + private List> retryExceptionsClasses; + + public RetryHelper() { + } + + public RetryHelper(long maxRetryIntervalMs, int maxRetryNumbers, long initialRetryIntervalMs, String retryExceptions) { + this.num = maxRetryNumbers; + this.initialIntervalTime = initialRetryIntervalMs; + this.maxIntervalTime = maxRetryIntervalMs; + if (StringUtils.isNullOrEmpty(retryExceptions)) { + this.retryExceptionsClasses = new ArrayList<>(); + } else { + this.retryExceptionsClasses = Arrays.stream(retryExceptions.split(",")) + .map(exception -> (Exception) ReflectionUtils.loadClass(exception, "")) + .map(Exception::getClass) + .collect(Collectors.toList()); + } + } + + public RetryHelper(String taskInfo) { + this.taskInfo = taskInfo; + } + + public RetryHelper tryWith(CheckedFunction func) { + this.func = func; + return this; + } + + public T start() throws IOException { + int retries = 0; + T functionResult = null; + + while (true) { + long waitTime = Math.min(getWaitTimeExp(retries), maxIntervalTime); + try { + functionResult = func.get(); + break; + } catch (IOException | RuntimeException e) { + if (!checkIfExceptionInRetryList(e)) { + throw e; + } + if (retries++ >= num) { + LOG.error("Still failed to " + taskInfo + " after retried " + num + " times.", e); + throw e; + } + LOG.warn("Catch Exception " + taskInfo + ", will retry after " + waitTime + " ms.", e); + try { + Thread.sleep(waitTime); + } catch (InterruptedException ex) { + // ignore InterruptedException here + } + } + } + + if (retries > 0) { + LOG.info("Success to " + taskInfo + " after retried " + retries + " times."); + } + return functionResult; + } + + private boolean checkIfExceptionInRetryList(Exception e) { + boolean inRetryList = false; + + // if users didn't set hoodie.filesystem.operation.retry.exceptions + // we will retry all the IOException and RuntimeException + if (retryExceptionsClasses.isEmpty()) { + return true; + } + + for (Class clazz : retryExceptionsClasses) { + if (clazz.isInstance(e)) { + inRetryList = true; + break; + } + } + return inRetryList; + } + + private long getWaitTimeExp(int retryCount) { + Random random = new Random(); + if (0 == retryCount) { + return initialIntervalTime; + } + + return (long) Math.pow(2, retryCount) * initialIntervalTime + random.nextInt(100); + } + + @FunctionalInterface + public interface CheckedFunction { + T get() throws IOException; + } +} \ No newline at end of file diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/SpillableMapUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/SpillableMapUtils.java index 934b5b5f616c6..9ded415438a86 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/SpillableMapUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/SpillableMapUtils.java @@ -19,6 +19,7 @@ package org.apache.hudi.common.util; import org.apache.hudi.common.fs.SizeAwareDataOutputStream; +import org.apache.hudi.common.model.HoodieAvroRecord; import 
org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieOperation; import org.apache.hudi.common.model.HoodieRecord; @@ -32,9 +33,9 @@ import java.io.IOException; import java.io.RandomAccessFile; -import java.util.zip.CRC32; import static org.apache.hudi.avro.HoodieAvroUtils.getNullableValAsString; +import static org.apache.hudi.common.util.BinaryUtil.generateChecksum; /** * A utility class supports spillable map. @@ -94,15 +95,6 @@ private static long spill(SizeAwareDataOutputStream outputStream, FileEntry file return outputStream.getSize(); } - /** - * Generate a checksum for a given set of bytes. - */ - public static long generateChecksum(byte[] data) { - CRC32 crc = new CRC32(); - crc.update(data); - return crc.getValue(); - } - /** * Compute a bytes representation of the payload by serializing the contents This is used to estimate the size of the * payload (either in memory or when written to disk). @@ -144,7 +136,7 @@ public static R convertToHoodieRecordPayload(GenericRecord record, String pa Object preCombineVal = getPreCombineVal(record, preCombineField); HoodieOperation operation = withOperationField ? HoodieOperation.fromName(getNullableValAsString(record, HoodieRecord.OPERATION_METADATA_FIELD)) : null; - HoodieRecord hoodieRecord = new HoodieRecord<>(new HoodieKey(recKey, partitionPath), + HoodieRecord hoodieRecord = new HoodieAvroRecord<>(new HoodieKey(recKey, partitionPath), ReflectionUtils.loadPayload(payloadClazz, new Object[]{record, preCombineVal}, GenericRecord.class, Comparable.class), operation); @@ -170,7 +162,7 @@ private static Object getPreCombineVal(GenericRecord rec, String preCombineField * Utility method to convert bytes to HoodieRecord using schema and payload class. */ public static R generateEmptyPayload(String recKey, String partitionPath, String payloadClazz) { - HoodieRecord hoodieRecord = new HoodieRecord<>(new HoodieKey(recKey, partitionPath), + HoodieRecord hoodieRecord = new HoodieAvroRecord<>(new HoodieKey(recKey, partitionPath), ReflectionUtils.loadPayload(payloadClazz, new Object[] {Option.empty()}, Option.class)); return (R) hoodieRecord; } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/BitCaskDiskMap.java b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/BitCaskDiskMap.java index 289901df81861..9fb0b20e74f2c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/BitCaskDiskMap.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/BitCaskDiskMap.java @@ -56,6 +56,8 @@ import java.util.zip.DeflaterOutputStream; import java.util.zip.InflaterInputStream; +import static org.apache.hudi.common.util.BinaryUtil.generateChecksum; + /** * This class provides a disk spillable only map implementation. All of the data is currenly written to one file, * without any rollover support. It uses the following : 1) An in-memory map that tracks the key-> latest ValueMetadata. 
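Editor's note: the new `RetryHelper` above wraps an IO-bound call with capped exponential backoff plus jitter: waits start at `initialRetryIntervalMs`, grow as powers of two via `getWaitTimeExp`, and are capped at `maxRetryIntervalMs`; an empty `hoodie.filesystem.operation.retry.exceptions` list retries every `IOException` and `RuntimeException`. A hedged usage sketch (the generic parameter is restored here, since type parameters are stripped in the flattened hunks; the wrapped call is a stand-in):

```java
import java.io.IOException;

import org.apache.hudi.common.util.RetryHelper;

public class RetryHelperSketch {
  public static String openWithRetry() throws IOException {
    // Cap waits at 2000 ms, allow 3 retries, start at 100 ms; the empty
    // exception list means any IOException/RuntimeException is retried.
    return new RetryHelper<String>(2000L, 3, 100L, "")
        .tryWith(() -> {
          // stand-in for a transiently failing call, e.g. fs.open(...).read()
          return "contents";
        })
        .start();
  }
}
```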
@@ -223,7 +225,7 @@ private synchronized R put(T key, R value, boolean flush) { new BitCaskDiskMap.ValueMetadata(this.filePath, valueSize, filePosition.get(), timestamp)); byte[] serializedKey = SerializationUtils.serialize(key); filePosition - .set(SpillableMapUtils.spillToDisk(writeOnlyFileHandle, new FileEntry(SpillableMapUtils.generateChecksum(val), + .set(SpillableMapUtils.spillToDisk(writeOnlyFileHandle, new FileEntry(generateChecksum(val), serializedKey.length, valueSize, serializedKey, val, timestamp))); if (flush) { flushToDisk(); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/ColumnID.java b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/ColumnIndexID.java similarity index 80% rename from hudi-common/src/main/java/org/apache/hudi/common/util/hash/ColumnID.java rename to hudi-common/src/main/java/org/apache/hudi/common/util/hash/ColumnIndexID.java index be4db44ecd961..92e60b30a311f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/ColumnID.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/ColumnIndexID.java @@ -24,14 +24,21 @@ /** * A stateful Hoodie object ID representing any table column. */ -public class ColumnID extends HoodieID { +public class ColumnIndexID extends HoodieIndexID { private static final Type TYPE = Type.COLUMN; - private static final HashID.Size ID_COLUMN_HASH_SIZE = HashID.Size.BITS_64; + public static final HashID.Size ID_COLUMN_HASH_SIZE = HashID.Size.BITS_64; + private final String column; private final byte[] hash; - public ColumnID(final String message) { - this.hash = HashID.hash(message, ID_COLUMN_HASH_SIZE); + public ColumnIndexID(final String column) { + this.column = column; + this.hash = HashID.hash(column, ID_COLUMN_HASH_SIZE); + } + + @Override + public String getName() { + return column; } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/FileID.java b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/FileIndexID.java similarity index 84% rename from hudi-common/src/main/java/org/apache/hudi/common/util/hash/FileID.java rename to hudi-common/src/main/java/org/apache/hudi/common/util/hash/FileIndexID.java index 0cb73c5abf9a8..3f9616908bb39 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/FileID.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/FileIndexID.java @@ -24,14 +24,21 @@ /** * Hoodie object ID representing any file. 
*/ -public class FileID extends HoodieID { +public class FileIndexID extends HoodieIndexID { private static final Type TYPE = Type.FILE; private static final HashID.Size ID_FILE_HASH_SIZE = HashID.Size.BITS_128; + private final String fileName; private final byte[] hash; - public FileID(final String message) { - this.hash = HashID.hash(message, ID_FILE_HASH_SIZE); + public FileIndexID(final String fileName) { + this.fileName = fileName; + this.hash = HashID.hash(fileName, ID_FILE_HASH_SIZE); + } + + @Override + public String getName() { + return fileName; } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HoodieID.java b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HoodieIndexID.java similarity index 89% rename from hudi-common/src/main/java/org/apache/hudi/common/util/hash/HoodieID.java rename to hudi-common/src/main/java/org/apache/hudi/common/util/hash/HoodieIndexID.java index e08e254b0a215..139efd17ed0ae 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HoodieID.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HoodieIndexID.java @@ -24,9 +24,10 @@ import java.io.Serializable; /** - * A serializable ID that can be used to identify any Hoodie table fields and resources. + * A serializable ID that can be used to identify any Hoodie table fields and + * resources in the on-disk index. */ -public abstract class HoodieID implements Serializable { +public abstract class HoodieIndexID implements Serializable { private static final long serialVersionUID = 1L; @@ -50,6 +51,13 @@ public String toString() { } } + /** + * Get the resource name for which this index id is generated. + * + * @return The resource name + */ + public abstract String getName(); + /** * Get the number of bits representing this ID in memory. *

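Editor's note: the renames above turn the hash-only `ColumnID`/`FileID`/`HoodieID` into `ColumnIndexID`/`FileIndexID`/`HoodieIndexID` (with `PartitionIndexID` below), each now retaining the source resource name behind the new `getName()`. A small sketch with made-up resource names:

```java
import org.apache.hudi.common.util.hash.ColumnIndexID;
import org.apache.hudi.common.util.hash.FileIndexID;
import org.apache.hudi.common.util.hash.PartitionIndexID;

public class IndexIdSketch {
  public static void main(String[] args) {
    // Each ID hashes its resource name at construction time: 64 bits for
    // columns and partitions, 128 bits for files (per the sizes above).
    ColumnIndexID columnId = new ColumnIndexID("rider");
    PartitionIndexID partitionId = new PartitionIndexID("2022/01/26");
    FileIndexID fileId = new FileIndexID("fg1-0_1-2-3_20220126.parquet");

    System.out.println(columnId.getName());    // rider
    System.out.println(partitionId.getName()); // 2022/01/26
    System.out.println(fileId.getName());      // fg1-0_1-2-3_20220126.parquet
  }
}
```

These IDs feed the bloom filter and column stats index keys built in `BaseTableMetadata` later in this diff.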
@@ -74,7 +82,7 @@ public String toString() { public abstract String toString(); /** - * + * Get the Base64 encoded version of the ID. */ public String asBase64EncodedString() { throw new HoodieNotSupportedException("Unsupported hash for " + getType()); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/PartitionID.java b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/PartitionIndexID.java similarity index 83% rename from hudi-common/src/main/java/org/apache/hudi/common/util/hash/PartitionID.java rename to hudi-common/src/main/java/org/apache/hudi/common/util/hash/PartitionIndexID.java index f31159faa2a2f..0fbae27b80de8 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/PartitionID.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/PartitionIndexID.java @@ -24,14 +24,21 @@ /** * Hoodie object ID representing any partition. */ -public class PartitionID extends HoodieID { +public class PartitionIndexID extends HoodieIndexID { private static final Type TYPE = Type.PARTITION; private static final HashID.Size ID_PARTITION_HASH_SIZE = HashID.Size.BITS_64; + private final String partition; private final byte[] hash; - public PartitionID(final String message) { - this.hash = HashID.hash(message, ID_PARTITION_HASH_SIZE); + public PartitionIndexID(final String partition) { + this.partition = partition; + this.hash = HashID.hash(partition, ID_PARTITION_HASH_SIZE); + } + + @Override + public String getName() { + return partition; } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/io/ByteBufferBackedInputStream.java b/hudi-common/src/main/java/org/apache/hudi/common/util/io/ByteBufferBackedInputStream.java new file mode 100644 index 0000000000000..0f96d1011a3f0 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/io/ByteBufferBackedInputStream.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.util.io; + +import javax.annotation.Nonnull; +import java.io.InputStream; +import java.nio.ByteBuffer; + +/** + * Instance of {@link InputStream} backed by {@link ByteBuffer}, implementing following + * functionality (on top of what's required by {@link InputStream}) + * + *

+ * <ol>
+ *   <li>Seeking: enables random access by allowing to seek to an arbitrary position w/in the stream</li>
+ *   <li>(Thread-safe) Copying: enables to copy from the underlying buffer not modifying the state of the stream</li>
+ * </ol>
+ * + * NOTE: Generally methods of this class are NOT thread-safe, unless specified otherwise + */ +public class ByteBufferBackedInputStream extends InputStream { + + private final ByteBuffer buffer; + private final int bufferOffset; + + public ByteBufferBackedInputStream(ByteBuffer buf) { + this.buffer = buf.duplicate(); + // We're marking current buffer position, so that we will be able + // to reset it later on appropriately (to support seek operations) + this.buffer.mark(); + this.bufferOffset = buffer.position(); + } + + public ByteBufferBackedInputStream(byte[] array) { + this(array, 0, array.length); + } + + public ByteBufferBackedInputStream(byte[] array, int offset, int length) { + this(ByteBuffer.wrap(array, offset, length)); + } + + @Override + public int read() { + if (!buffer.hasRemaining()) { + throw new IllegalArgumentException("Reading past backed buffer boundary"); + } + return buffer.get() & 0xFF; + } + + @Override + public int read(@Nonnull byte[] bytes, int offset, int length) { + if (!buffer.hasRemaining()) { + throw new IllegalArgumentException("Reading past backed buffer boundary"); + } + // Determine total number of bytes available to read + int available = Math.min(length, buffer.remaining()); + // Copy bytes into the target buffer + buffer.get(bytes, offset, available); + return available; + } + + /** + * Returns current position of the stream + */ + public int getPosition() { + return buffer.position() - bufferOffset; + } + + /** + * Seeks to a position w/in the stream + * + * NOTE: Position is relative to the start of the stream (ie its absolute w/in this stream), + * with following invariant being assumed: + *

+ * <pre>0 <= pos <= length (of the stream)</pre>
+ * + * This method is NOT thread-safe + * + * @param pos target position to seek to w/in the holding buffer + */ + public void seek(long pos) { + buffer.reset(); // to mark + int offset = buffer.position(); + // NOTE: That the new pos is still relative to buffer's offset + int newPos = offset + (int) pos; + if (newPos > buffer.limit() || newPos < offset) { + throw new IllegalArgumentException( + String.format("Can't seek past the backing buffer (limit %d, offset %d, new %d)", buffer.limit(), offset, newPos) + ); + } + + buffer.position(newPos); + } + + /** + * Copies at most {@code length} bytes starting from position {@code pos} into the target + * buffer with provided {@code offset}. Returns number of bytes copied from the backing buffer + * + * NOTE: This does not change the current position of the stream and is thread-safe + * + * @param pos absolute position w/in stream to read from + * @param targetBuffer target buffer to copy into + * @param offset target buffer offset to copy at + * @param length length of the sequence to copy + * @return number of bytes copied + */ + public int copyFrom(long pos, byte[] targetBuffer, int offset, int length) { + int bufferPos = bufferOffset + (int) pos; + if (bufferPos > buffer.limit()) { + throw new IllegalArgumentException( + String.format("Can't read past the backing buffer boundary (offset %d, length %d)", pos, buffer.limit() - bufferOffset) + ); + } else if (length > targetBuffer.length) { + throw new IllegalArgumentException( + String.format("Target buffer is too small (length %d, buffer size %d)", length, targetBuffer.length) + ); + } + // Determine total number of bytes available to read + int available = Math.min(length, buffer.limit() - bufferPos); + // Get current buffer position in the backing array + System.arraycopy(buffer.array(), bufferPos, targetBuffer, offset, available); + return available; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryExecutor.java b/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryExecutor.java index 872837913b054..68b840a4794d6 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryExecutor.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/queue/BoundedInMemoryExecutor.java @@ -19,6 +19,7 @@ package org.apache.hudi.common.util.queue; import org.apache.hudi.common.util.DefaultSizeEstimator; +import org.apache.hudi.common.util.Functions; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.SizeEstimator; import org.apache.hudi.exception.HoodieException; @@ -26,7 +27,8 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; import java.util.List; import java.util.concurrent.CompletableFuture; import java.util.concurrent.CountDownLatch; @@ -54,29 +56,35 @@ public class BoundedInMemoryExecutor { private final List> producers; // Consumer private final Option> consumer; + // pre-execute function to implement environment specific behavior before executors (producers/consumer) run + private final Runnable preExecuteRunnable; + + public BoundedInMemoryExecutor(final long bufferLimitInBytes, final Iterator inputItr, + BoundedInMemoryQueueConsumer consumer, Function transformFunction, Runnable preExecuteRunnable) { + this(bufferLimitInBytes, new IteratorBasedQueueProducer<>(inputItr), Option.of(consumer), transformFunction, preExecuteRunnable); + } public 
BoundedInMemoryExecutor(final long bufferLimitInBytes, BoundedInMemoryQueueProducer producer, Option> consumer, final Function transformFunction) { - this(bufferLimitInBytes, Arrays.asList(producer), consumer, transformFunction, new DefaultSizeEstimator<>()); + this(bufferLimitInBytes, producer, consumer, transformFunction, Functions.noop()); + } + + public BoundedInMemoryExecutor(final long bufferLimitInBytes, BoundedInMemoryQueueProducer producer, + Option> consumer, final Function transformFunction, Runnable preExecuteRunnable) { + this(bufferLimitInBytes, Collections.singletonList(producer), consumer, transformFunction, new DefaultSizeEstimator<>(), preExecuteRunnable); } public BoundedInMemoryExecutor(final long bufferLimitInBytes, List> producers, Option> consumer, final Function transformFunction, - final SizeEstimator sizeEstimator) { + final SizeEstimator sizeEstimator, Runnable preExecuteRunnable) { this.producers = producers; this.consumer = consumer; + this.preExecuteRunnable = preExecuteRunnable; // Ensure single thread for each producer thread and one for consumer this.executorService = Executors.newFixedThreadPool(producers.size() + 1); this.queue = new BoundedInMemoryQueue<>(bufferLimitInBytes, transformFunction, sizeEstimator); } - /** - * Callback to implement environment specific behavior before executors (producers/consumer) run. - */ - public void preExecute() { - // Do Nothing in general context - } - /** * Start all Producers. */ @@ -88,7 +96,7 @@ public ExecutorCompletionService startProducers() { producers.stream().map(producer -> { return completionService.submit(() -> { try { - preExecute(); + preExecuteRunnable.run(); producer.produce(queue); } catch (Throwable e) { LOG.error("error producing records", e); @@ -116,7 +124,7 @@ private Future startConsumer() { return consumer.map(consumer -> { return executorService.submit(() -> { LOG.info("starting consumer thread"); - preExecute(); + preExecuteRunnable.run(); try { E result = consumer.consume(queue); LOG.info("Queue Consumption is done; notifying producer threads"); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetConfig.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetConfig.java similarity index 100% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetConfig.java rename to hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroParquetConfig.java diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieBaseParquetConfig.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieBaseParquetConfig.java similarity index 100% rename from hudi-client/hudi-client-common/src/main/java/org/apache/hudi/io/storage/HoodieBaseParquetConfig.java rename to hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieBaseParquetConfig.java diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileReader.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileReader.java index fefe7eb7e5cc6..cb330b81432bf 100644 --- a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieFileReader.java @@ -20,6 +20,8 @@ import java.io.IOException; import java.util.Iterator; +import java.util.List; +import java.util.Map; import java.util.Set; import org.apache.avro.Schema; @@ -27,7 +29,7 @@ import org.apache.hudi.common.bloom.BloomFilter; import 
org.apache.hudi.common.util.Option; -public interface HoodieFileReader { +public interface HoodieFileReader extends AutoCloseable { public String[] readMinMaxRecordKeys(); @@ -35,6 +37,10 @@ public interface HoodieFileReader { public Set filterRowKeys(Set candidateRowKeys); + default Map getRecordsByKeys(List rowKeys) throws IOException { + throw new UnsupportedOperationException(); + } + public Iterator getRecordIterator(Schema readerSchema) throws IOException; default Iterator getRecordIterator() throws IOException { diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java index e3e38eca86ca9..371da7675e992 100644 --- a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieHFileReader.java @@ -18,17 +18,18 @@ package org.apache.hudi.io.storage; -import java.io.ByteArrayInputStream; import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; -import java.util.HashSet; +import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.TreeSet; +import java.util.stream.Collectors; import org.apache.avro.Schema; import org.apache.avro.generic.IndexedRecord; @@ -49,11 +50,17 @@ import org.apache.hudi.common.bloom.BloomFilter; import org.apache.hudi.common.bloom.BloomFilterFactory; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.common.util.io.ByteBufferBackedInputStream; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; -public class HoodieHFileReader implements HoodieFileReader { +public class HoodieHFileReader implements HoodieFileReader { + private static final Logger LOG = LogManager.getLogger(HoodieHFileReader.class); private Path path; private Configuration conf; private HFile.Reader reader; @@ -63,6 +70,7 @@ public class HoodieHFileReader implements HoodieFileRea // key retrieval. 
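Editor's note: `HoodieFileReader` now extends `AutoCloseable` and gains a default `getRecordsByKeys` that throws `UnsupportedOperationException`; `HoodieHFileReader` below overrides it with sorted, seek-based point lookups. A hedged caller-side sketch (type parameters, stripped in the flattened hunks, are restored; the reader instance is assumed to come from the usual reader factory):

```java
import java.util.Arrays;
import java.util.Map;

import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.io.storage.HoodieFileReader;

public class PointLookupSketch {
  // With the AutoCloseable bound, any HoodieFileReader can be managed by
  // try-with-resources; close() is declared to throw Exception.
  public static Map<String, GenericRecord> lookup(HoodieFileReader<GenericRecord> reader)
      throws Exception {
    try (HoodieFileReader<GenericRecord> r = reader) {
      // HFile-backed readers sort these keys and seek each one; readers that
      // keep the default (e.g. Parquet) throw UnsupportedOperationException.
      return r.getRecordsByKeys(Arrays.asList("key-001", "key-042"));
    }
  }
}
```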
private HFileScanner keyScanner; + public static final String KEY_FIELD_NAME = "key"; public static final String KEY_SCHEMA = "schema"; public static final String KEY_BLOOM_FILTER_META_BLOCK = "bloomFilter"; public static final String KEY_BLOOM_FILTER_TYPE_CODE = "bloomFilterTypeCode"; @@ -75,11 +83,11 @@ public HoodieHFileReader(Configuration configuration, Path path, CacheConfig cac this.reader = HFile.createReader(FSUtils.getFs(path.toString(), configuration), path, cacheConfig, conf); } - public HoodieHFileReader(Configuration configuration, Path path, CacheConfig cacheConfig, FileSystem inlineFs) throws IOException { + public HoodieHFileReader(Configuration configuration, Path path, CacheConfig cacheConfig, FileSystem fs) throws IOException { this.conf = configuration; this.path = path; - this.fsDataInputStream = inlineFs.open(path); - this.reader = HFile.createReader(inlineFs, path, cacheConfig, configuration); + this.fsDataInputStream = fs.open(path); + this.reader = HFile.createReader(fs, path, cacheConfig, configuration); } public HoodieHFileReader(byte[] content) throws IOException { @@ -116,6 +124,13 @@ public Schema getSchema() { return schema; } + /** + * Sets up the writer schema explicitly. + */ + public void withSchema(Schema schema) { + this.schema = schema; + } + @Override public BloomFilter readBloomFilter() { Map fileInfo; @@ -131,35 +146,69 @@ public BloomFilter readBloomFilter() { } } + /** + * Filter keys by availability. + *

+ * Note: This method is performant when the caller passes in a sorted candidate keys. + * + * @param candidateRowKeys - Keys to check for the availability + * @return Subset of candidate keys that are available + */ @Override - public Set filterRowKeys(Set candidateRowKeys) { - // Current implementation reads all records and filters them. In certain cases, it many be better to: - // 1. Scan a limited subset of keys (min/max range of candidateRowKeys) - // 2. Lookup keys individually (if the size of candidateRowKeys is much less than the total keys in file) - try { - List> allRecords = readAllRecords(); - Set rowKeys = new HashSet<>(); - allRecords.forEach(t -> { - if (candidateRowKeys.contains(t.getFirst())) { - rowKeys.add(t.getFirst()); - } - }); - return rowKeys; - } catch (IOException e) { - throw new HoodieIOException("Failed to read row keys from " + path, e); + public Set filterRowKeys(Set candidateRowKeys) { + return candidateRowKeys.stream().filter(k -> { + try { + return isKeyAvailable(k); + } catch (IOException e) { + LOG.error("Failed to check key availability: " + k); + return false; + } + }).collect(Collectors.toSet()); + } + + @Override + public Map getRecordsByKeys(List rowKeys) throws IOException { + return filterRecordsImpl(new TreeSet<>(rowKeys)); + } + + /** + * Filter records by sorted keys. + *

+ * TODO: Implement single seek and sequential scan till the last candidate key + * instead of repeated seeks. + * + * @param sortedCandidateRowKeys - Sorted set of keys to fetch records for + * @return Map of keys to fetched records + * @throws IOException When the deserialization of records fail + */ + private synchronized Map filterRecordsImpl(TreeSet sortedCandidateRowKeys) throws IOException { + HashMap filteredRecords = new HashMap<>(); + for (String key : sortedCandidateRowKeys) { + Option record = getRecordByKey(key); + if (record.isPresent()) { + filteredRecords.put(key, record.get()); + } } + return filteredRecords; } - public List> readAllRecords(Schema writerSchema, Schema readerSchema) throws IOException { + /** + * Reads all the records with given schema. + * + *

NOTE: This should only be used for testing, + * the records are materialized eagerly into a list and returned, + * use {@code getRecordIterator} where possible. + */ + private List> readAllRecords(Schema writerSchema, Schema readerSchema) { + final Option keyFieldSchema = Option.ofNullable(readerSchema.getField(KEY_FIELD_NAME)); List> recordList = new LinkedList<>(); try { final HFileScanner scanner = reader.getScanner(false, false); if (scanner.seekTo()) { do { Cell c = scanner.getKeyValue(); - byte[] keyBytes = Arrays.copyOfRange(c.getRowArray(), c.getRowOffset(), c.getRowOffset() + c.getRowLength()); - R record = getRecordFromCell(c, writerSchema, readerSchema); - recordList.add(new Pair<>(new String(keyBytes), record)); + final Pair keyAndRecordPair = getRecordFromCell(c, writerSchema, readerSchema, keyFieldSchema); + recordList.add(keyAndRecordPair); } while (scanner.next()); } @@ -169,17 +218,36 @@ public List> readAllRecords(Schema writerSchema, Schema readerSc } } - public List> readAllRecords() throws IOException { - Schema schema = new Schema.Parser().parse(new String(reader.loadFileInfo().get(KEY_SCHEMA.getBytes()))); + /** + * Reads all the records with current schema. + * + *

NOTE: This should only be used for testing, + * the records are materialized eagerly into a list and returned, + * use {@code getRecordIterator} where possible. + */ + public List> readAllRecords() { + Schema schema = getSchema(); return readAllRecords(schema, schema); } + /** + * Reads all the records with current schema and filtering keys. + * + *

NOTE: This should only be used for testing, + * the records are materialized eagerly into a list and returned, + * use {@code getRecordIterator} where possible. + */ public List> readRecords(List keys) throws IOException { - reader.loadFileInfo(); - Schema schema = new Schema.Parser().parse(new String(reader.loadFileInfo().get(KEY_SCHEMA.getBytes()))); - return readRecords(keys, schema); + return readRecords(keys, getSchema()); } + /** + * Reads all the records with given schema and filtering keys. + * + *

NOTE: This should only be used for testing, + * the records are materialized eagerly into a list and returned, + * use {@code getRecordIterator} where possible. + */ public List> readRecords(List keys, Schema schema) throws IOException { this.schema = schema; reader.loadFileInfo(); @@ -193,9 +261,45 @@ public List> readRecords(List keys, Schema schema) throw return records; } + public ClosableIterator getRecordIterator(List keys, Schema schema) throws IOException { + this.schema = schema; + reader.loadFileInfo(); + Iterator iterator = keys.iterator(); + return new ClosableIterator() { + private R next; + @Override + public void close() { + } + + @Override + public boolean hasNext() { + try { + while (iterator.hasNext()) { + Option value = getRecordByKey(iterator.next(), schema); + if (value.isPresent()) { + next = value.get(); + return true; + } + } + return false; + } catch (IOException e) { + throw new HoodieIOException("unable to read next record from hfile ", e); + } + } + + @Override + public R next() { + return next; + } + }; + } + @Override public Iterator getRecordIterator(Schema readerSchema) throws IOException { final HFileScanner scanner = reader.getScanner(false, false); + final Option keyFieldSchema = Option.ofNullable(readerSchema.getField(KEY_FIELD_NAME)); + ValidationUtils.checkState(keyFieldSchema != null, + "Missing key field '" + KEY_FIELD_NAME + "' in the schema!"); return new Iterator() { private R next = null; private boolean eof = false; @@ -206,7 +310,8 @@ public boolean hasNext() { // To handle when hasNext() is called multiple times for idempotency and/or the first time if (this.next == null && !this.eof) { if (!scanner.isSeeked() && scanner.seekTo()) { - this.next = getRecordFromCell(scanner.getKeyValue(), getSchema(), readerSchema); + final Pair keyAndRecordPair = getRecordFromCell(scanner.getKeyValue(), getSchema(), readerSchema, keyFieldSchema); + this.next = keyAndRecordPair.getSecond(); } } return this.next != null; @@ -226,7 +331,8 @@ public R next() { } R retVal = this.next; if (scanner.next()) { - this.next = getRecordFromCell(scanner.getKeyValue(), getSchema(), readerSchema); + final Pair keyAndRecordPair = getRecordFromCell(scanner.getKeyValue(), getSchema(), readerSchema, keyFieldSchema); + this.next = keyAndRecordPair.getSecond(); } else { this.next = null; this.eof = true; @@ -239,9 +345,24 @@ public R next() { }; } + private boolean isKeyAvailable(String key) throws IOException { + final KeyValue kv = new KeyValue(key.getBytes(), null, null, null); + synchronized (this) { + if (keyScanner == null) { + keyScanner = reader.getScanner(false, false); + } + if (keyScanner.seekTo(kv) == 0) { + return true; + } + } + return false; + } + @Override public Option getRecordByKey(String key, Schema readerSchema) throws IOException { byte[] value = null; + final Option keyFieldSchema = Option.ofNullable(readerSchema.getField(KEY_FIELD_NAME)); + ValidationUtils.checkState(keyFieldSchema != null); KeyValue kv = new KeyValue(key.getBytes(), null, null, null); synchronized (this) { @@ -257,16 +378,51 @@ public Option getRecordByKey(String key, Schema readerSchema) throws IOException } if (value != null) { - R record = (R)HoodieAvroUtils.bytesToAvro(value, getSchema(), readerSchema); + R record = deserialize(key.getBytes(), value, getSchema(), readerSchema, keyFieldSchema); return Option.of(record); } return Option.empty(); } - private R getRecordFromCell(Cell c, Schema writerSchema, Schema readerSchema) throws IOException { - byte[] value = 
Arrays.copyOfRange(c.getValueArray(), c.getValueOffset(), c.getValueOffset() + c.getValueLength()); - return (R)HoodieAvroUtils.bytesToAvro(value, writerSchema, readerSchema); + private Pair getRecordFromCell(Cell cell, Schema writerSchema, Schema readerSchema, Option keyFieldSchema) throws IOException { + final byte[] keyBytes = Arrays.copyOfRange(cell.getRowArray(), cell.getRowOffset(), cell.getRowOffset() + cell.getRowLength()); + final byte[] valueBytes = Arrays.copyOfRange(cell.getValueArray(), cell.getValueOffset(), cell.getValueOffset() + cell.getValueLength()); + R record = deserialize(keyBytes, valueBytes, writerSchema, readerSchema, keyFieldSchema); + return new Pair<>(new String(keyBytes), record); + } + + /** + * Deserialize the record byte array contents to record object. + * + * @param keyBytes - Record key as byte array + * @param valueBytes - Record content as byte array + * @param writerSchema - Writer schema + * @param readerSchema - Reader schema + * @param keyFieldSchema - Key field id in the schema + * @return Deserialized record object + */ + private R deserialize(final byte[] keyBytes, final byte[] valueBytes, Schema writerSchema, Schema readerSchema, + Option keyFieldSchema) throws IOException { + R record = (R) HoodieAvroUtils.bytesToAvro(valueBytes, writerSchema, readerSchema); + materializeRecordIfNeeded(keyBytes, record, keyFieldSchema); + return record; + } + + /** + * Materialize the record for any missing fields, if needed. + * + * @param keyBytes - Key byte array + * @param record - Record object to materialize + * @param keyFieldSchema - Key field id in the schema + */ + private void materializeRecordIfNeeded(final byte[] keyBytes, R record, Option keyFieldSchema) { + if (keyFieldSchema.isPresent()) { + final Object keyObject = record.get(keyFieldSchema.get().pos()); + if (keyObject != null && keyObject.toString().isEmpty()) { + record.put(keyFieldSchema.get().pos(), new String(keyBytes)); + } + } } @Override @@ -288,28 +444,14 @@ public synchronized void close() { } } - static class SeekableByteArrayInputStream extends ByteArrayInputStream implements Seekable, PositionedReadable { + static class SeekableByteArrayInputStream extends ByteBufferBackedInputStream implements Seekable, PositionedReadable { public SeekableByteArrayInputStream(byte[] buf) { super(buf); } @Override public long getPos() throws IOException { - return pos; - } - - @Override - public void seek(long pos) throws IOException { - if (mark != 0) { - throw new IllegalStateException(); - } - - reset(); - long skipped = skip(pos); - - if (skipped != pos) { - throw new IOException(); - } + return getPosition(); } @Override @@ -319,19 +461,7 @@ public boolean seekToNewSource(long targetPos) throws IOException { @Override public int read(long position, byte[] buffer, int offset, int length) throws IOException { - - if (position >= buf.length) { - throw new IllegalArgumentException(); - } - if (position + length > buf.length) { - throw new IllegalArgumentException(); - } - if (length > buffer.length) { - throw new IllegalArgumentException(); - } - - System.arraycopy(buf, (int) position, buffer, offset, length); - return length; + return copyFrom(position, buffer, offset, length); } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetReader.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetReader.java index 9ead1ac87ba50..9ad07dfafbf60 100644 --- a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetReader.java +++ 
b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetReader.java @@ -34,9 +34,9 @@ import org.apache.parquet.avro.AvroReadSupport; import org.apache.parquet.hadoop.ParquetReader; -public class HoodieParquetReader implements HoodieFileReader { - private Path path; - private Configuration conf; +public class HoodieParquetReader implements HoodieFileReader { + private final Path path; + private final Configuration conf; private final BaseFileUtils parquetUtils; public HoodieParquetReader(Configuration configuration, Path path) { @@ -45,6 +45,7 @@ public HoodieParquetReader(Configuration configuration, Path path) { this.parquetUtils = BaseFileUtils.getInstance(HoodieFileFormat.PARQUET); } + @Override public String[] readMinMaxRecordKeys() { return parquetUtils.readMinMaxRecordKeys(conf, path); } @@ -55,15 +56,15 @@ public BloomFilter readBloomFilter() { } @Override - public Set filterRowKeys(Set candidateRowKeys) { + public Set filterRowKeys(Set candidateRowKeys) { return parquetUtils.filterRowKeys(conf, path, candidateRowKeys); } @Override public Iterator getRecordIterator(Schema schema) throws IOException { AvroReadSupport.setAvroReadSchema(conf, schema); - ParquetReader reader = AvroParquetReader.builder(path).withConf(conf).build(); - return new ParquetReaderIterator(reader); + ParquetReader reader = AvroParquetReader.builder(path).withConf(conf).build(); + return new ParquetReaderIterator<>(reader); } @Override diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetStreamWriter.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetStreamWriter.java new file mode 100644 index 0000000000000..a2736018242b6 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieParquetStreamWriter.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.io.storage; + +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.avro.HoodieAvroWriteSupport; +import org.apache.hudi.parquet.io.OutputStreamBackedOutputFile; +import org.apache.parquet.hadoop.ParquetFileWriter; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.api.WriteSupport; +import org.apache.parquet.io.OutputFile; + +import java.io.IOException; + +// TODO(HUDI-3035) unify w/ HoodieParquetWriter +public class HoodieParquetStreamWriter implements AutoCloseable { + + private final ParquetWriter writer; + private final HoodieAvroWriteSupport writeSupport; + + public HoodieParquetStreamWriter(FSDataOutputStream outputStream, + HoodieAvroParquetConfig parquetConfig) throws IOException { + this.writeSupport = parquetConfig.getWriteSupport(); + this.writer = new Builder(new OutputStreamBackedOutputFile(outputStream), writeSupport) + .withWriteMode(ParquetFileWriter.Mode.CREATE) + .withCompressionCodec(parquetConfig.getCompressionCodecName()) + .withRowGroupSize(parquetConfig.getBlockSize()) + .withPageSize(parquetConfig.getPageSize()) + .withDictionaryPageSize(parquetConfig.getPageSize()) + .withDictionaryEncoding(parquetConfig.dictionaryEnabled()) + .withWriterVersion(ParquetWriter.DEFAULT_WRITER_VERSION) + .withConf(parquetConfig.getHadoopConf()) + .build(); + } + + public void writeAvro(String key, R object) throws IOException { + writer.write(object); + writeSupport.add(key); + } + + @Override + public void close() throws IOException { + writer.close(); + } + + private static class Builder extends ParquetWriter.Builder> { + private final WriteSupport writeSupport; + + private Builder(Path file, WriteSupport writeSupport) { + super(file); + this.writeSupport = writeSupport; + } + + private Builder(OutputFile file, WriteSupport writeSupport) { + super(file); + this.writeSupport = writeSupport; + } + + @Override + protected Builder self() { + return this; + } + + @Override + protected WriteSupport getWriteSupport(Configuration conf) { + return writeSupport; + } + } +} \ No newline at end of file diff --git a/hudi-common/src/main/java/org/apache/hudi/keygen/constant/KeyGeneratorOptions.java b/hudi-common/src/main/java/org/apache/hudi/keygen/constant/KeyGeneratorOptions.java index 6a1f761219221..ff182c4c1661f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/keygen/constant/KeyGeneratorOptions.java +++ b/hudi-common/src/main/java/org/apache/hudi/keygen/constant/KeyGeneratorOptions.java @@ -96,5 +96,28 @@ public class KeyGeneratorOptions extends HoodieConfig { */ @Deprecated public static final String PARTITIONPATH_FIELD_OPT_KEY = PARTITIONPATH_FIELD_NAME.key(); + + /** + * Supported configs. + */ + public static class Config { + + // One value from TimestampType above + public static final String TIMESTAMP_TYPE_FIELD_PROP = "hoodie.deltastreamer.keygen.timebased.timestamp.type"; + public static final String INPUT_TIME_UNIT = + "hoodie.deltastreamer.keygen.timebased.timestamp.scalar.time.unit"; + //This prop can now accept list of input date formats. 
+ public static final String TIMESTAMP_INPUT_DATE_FORMAT_PROP = + "hoodie.deltastreamer.keygen.timebased.input.dateformat"; + public static final String TIMESTAMP_INPUT_DATE_FORMAT_LIST_DELIMITER_REGEX_PROP = "hoodie.deltastreamer.keygen.timebased.input.dateformat.list.delimiter.regex"; + public static final String TIMESTAMP_INPUT_TIMEZONE_FORMAT_PROP = "hoodie.deltastreamer.keygen.timebased.input.timezone"; + public static final String TIMESTAMP_OUTPUT_DATE_FORMAT_PROP = + "hoodie.deltastreamer.keygen.timebased.output.dateformat"; + //still keeping this prop for backward compatibility so that functionality for existing users does not break. + public static final String TIMESTAMP_TIMEZONE_FORMAT_PROP = + "hoodie.deltastreamer.keygen.timebased.timezone"; + public static final String TIMESTAMP_OUTPUT_TIMEZONE_FORMAT_PROP = "hoodie.deltastreamer.keygen.timebased.output.timezone"; + public static final String DATE_TIME_PARSER_PROP = "hoodie.deltastreamer.keygen.datetime.parser.class"; + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java b/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java index ccd421e677651..3c648f38defc6 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java @@ -19,6 +19,10 @@ package org.apache.hudi.metadata; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.avro.model.HoodieMetadataBloomFilter; +import org.apache.hudi.avro.model.HoodieMetadataColumnStats; import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.config.SerializableConfiguration; import org.apache.hudi.common.engine.HoodieEngineContext; @@ -30,29 +34,33 @@ import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.HoodieTimer; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.common.util.hash.ColumnIndexID; +import org.apache.hudi.common.util.hash.FileIndexID; +import org.apache.hudi.common.util.hash.PartitionIndexID; import org.apache.hudi.exception.HoodieMetadataException; - -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import java.io.IOException; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.TreeSet; import java.util.stream.Collectors; public abstract class BaseTableMetadata implements HoodieTableMetadata { private static final Logger LOG = LogManager.getLogger(BaseTableMetadata.class); - static final long MAX_MEMORY_SIZE_IN_BYTES = 1024 * 1024 * 1024; - static final int BUFFER_SIZE = 10 * 1024 * 1024; + public static final long MAX_MEMORY_SIZE_IN_BYTES = 1024 * 1024 * 1024; + public static final int BUFFER_SIZE = 10 * 1024 * 1024; protected final transient HoodieEngineContext engineContext; protected final SerializableConfiguration hadoopConf; @@ -63,7 +71,9 @@ public abstract class BaseTableMetadata implements HoodieTableMetadata { // Directory used for Spillable Map when merging records protected final String spillableMapDirectory; - protected boolean enabled; + protected boolean isMetadataTableEnabled; + protected boolean 
isBloomFilterIndexEnabled = false; + protected boolean isColumnStatsIndexEnabled = false; protected BaseTableMetadata(HoodieEngineContext engineContext, HoodieMetadataConfig metadataConfig, String dataBasePath, String spillableMapDirectory) { @@ -74,7 +84,7 @@ protected BaseTableMetadata(HoodieEngineContext engineContext, HoodieMetadataCon this.spillableMapDirectory = spillableMapDirectory; this.metadataConfig = metadataConfig; - this.enabled = metadataConfig.enabled(); + this.isMetadataTableEnabled = metadataConfig.enabled(); if (metadataConfig.enableMetrics()) { this.metrics = Option.of(new HoodieMetadataMetrics(Registry.getRegistry("HoodieMetadata"))); } else { @@ -84,16 +94,15 @@ protected BaseTableMetadata(HoodieEngineContext engineContext, HoodieMetadataCon /** * Return the list of partitions in the dataset. - * + *

* If the Metadata Table is enabled, the listing is retrieved from the stored metadata. Otherwise, the list of * partitions is retrieved directly from the underlying {@code FileSystem}. - * + *

* On any errors retrieving the listing from the metadata, defaults to using the file system listings. - * */ @Override public List getAllPartitionPaths() throws IOException { - if (enabled) { + if (isMetadataTableEnabled) { try { return fetchAllPartitionPaths(); } catch (Exception e) { @@ -106,10 +115,10 @@ public List getAllPartitionPaths() throws IOException { /** * Return the list of files in a partition. - * + *

* If the Metadata Table is enabled, the listing is retrieved from the stored metadata. Otherwise, the list of * partitions is retrieved directly from the underlying {@code FileSystem}. - * + *

* On any errors retrieving the listing from the metadata, defaults to using the file system listings. * * @param partitionPath The absolute path of the partition to list @@ -117,7 +126,7 @@ public List getAllPartitionPaths() throws IOException { @Override public FileStatus[] getAllFilesInPartition(Path partitionPath) throws IOException { - if (enabled) { + if (isMetadataTableEnabled) { try { return fetchAllFilesInPartition(partitionPath); } catch (Exception e) { @@ -132,7 +141,7 @@ public FileStatus[] getAllFilesInPartition(Path partitionPath) @Override public Map getAllFilesInPartitions(List partitions) throws IOException { - if (enabled) { + if (isMetadataTableEnabled) { try { List partitionPaths = partitions.stream().map(entry -> new Path(entry)).collect(Collectors.toList()); Map partitionsFilesMap = fetchAllFilesInPartitionPaths(partitionPaths); @@ -146,12 +155,124 @@ public Map getAllFilesInPartitions(List partitions .getAllFilesInPartitions(partitions); } + @Override + public Option getBloomFilter(final String partitionName, final String fileName) + throws HoodieMetadataException { + if (!isBloomFilterIndexEnabled) { + LOG.error("Metadata bloom filter index is disabled!"); + return Option.empty(); + } + + final Pair partitionFileName = Pair.of(partitionName, fileName); + Map, ByteBuffer> bloomFilters = getBloomFilters(Collections.singletonList(partitionFileName)); + if (bloomFilters.isEmpty()) { + LOG.error("Meta index: missing bloom filter for partition: " + partitionName + ", file: " + fileName); + return Option.empty(); + } + + ValidationUtils.checkState(bloomFilters.containsKey(partitionFileName)); + return Option.of(bloomFilters.get(partitionFileName)); + } + + @Override + public Map, ByteBuffer> getBloomFilters(final List> partitionNameFileNameList) + throws HoodieMetadataException { + if (!isBloomFilterIndexEnabled) { + LOG.error("Metadata bloom filter index is disabled!"); + return Collections.emptyMap(); + } + if (partitionNameFileNameList.isEmpty()) { + return Collections.emptyMap(); + } + + HoodieTimer timer = new HoodieTimer().startTimer(); + Set partitionIDFileIDSortedStrings = new TreeSet<>(); + Map> fileToKeyMap = new HashMap<>(); + partitionNameFileNameList.forEach(partitionNameFileNamePair -> { + final String bloomFilterIndexKey = HoodieMetadataPayload.getBloomFilterIndexKey( + new PartitionIndexID(partitionNameFileNamePair.getLeft()), new FileIndexID(partitionNameFileNamePair.getRight())); + partitionIDFileIDSortedStrings.add(bloomFilterIndexKey); + fileToKeyMap.put(bloomFilterIndexKey, partitionNameFileNamePair); + } + ); + + List partitionIDFileIDStrings = new ArrayList<>(partitionIDFileIDSortedStrings); + List>>> hoodieRecordList = + getRecordsByKeys(partitionIDFileIDStrings, MetadataPartitionType.BLOOM_FILTERS.getPartitionPath()); + metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_BLOOM_FILTERS_METADATA_STR, + (timer.endTimer() / partitionIDFileIDStrings.size()))); + + Map, ByteBuffer> partitionFileToBloomFilterMap = new HashMap<>(); + for (final Pair>> entry : hoodieRecordList) { + if (entry.getRight().isPresent()) { + final Option bloomFilterMetadata = + entry.getRight().get().getData().getBloomFilterMetadata(); + if (bloomFilterMetadata.isPresent()) { + if (!bloomFilterMetadata.get().getIsDeleted()) { + ValidationUtils.checkState(fileToKeyMap.containsKey(entry.getLeft())); + partitionFileToBloomFilterMap.put(fileToKeyMap.get(entry.getLeft()), bloomFilterMetadata.get().getBloomFilter()); + } + } else { + LOG.error("Meta index bloom filter 
missing for: " + fileToKeyMap.get(entry.getLeft())); + } + } + } + return partitionFileToBloomFilterMap; + } + + @Override + public Map, HoodieMetadataColumnStats> getColumnStats(final List> partitionNameFileNameList, final String columnName) + throws HoodieMetadataException { + if (!isColumnStatsIndexEnabled) { + LOG.error("Metadata column stats index is disabled!"); + return Collections.emptyMap(); + } + + Map> columnStatKeyToFileNameMap = new HashMap<>(); + TreeSet sortedKeys = new TreeSet<>(); + final ColumnIndexID columnIndexID = new ColumnIndexID(columnName); + for (Pair partitionNameFileNamePair : partitionNameFileNameList) { + final String columnStatsIndexKey = HoodieMetadataPayload.getColumnStatsIndexKey( + new PartitionIndexID(partitionNameFileNamePair.getLeft()), + new FileIndexID(partitionNameFileNamePair.getRight()), + columnIndexID); + sortedKeys.add(columnStatsIndexKey); + columnStatKeyToFileNameMap.put(columnStatsIndexKey, partitionNameFileNamePair); + } + + List columnStatKeys = new ArrayList<>(sortedKeys); + HoodieTimer timer = new HoodieTimer().startTimer(); + List>>> hoodieRecordList = + getRecordsByKeys(columnStatKeys, MetadataPartitionType.COLUMN_STATS.getPartitionPath()); + metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_COLUMN_STATS_METADATA_STR, timer.endTimer())); + + Map, HoodieMetadataColumnStats> fileToColumnStatMap = new HashMap<>(); + for (final Pair>> entry : hoodieRecordList) { + if (entry.getRight().isPresent()) { + final Option columnStatMetadata = + entry.getRight().get().getData().getColumnStatMetadata(); + if (columnStatMetadata.isPresent()) { + if (!columnStatMetadata.get().getIsDeleted()) { + ValidationUtils.checkState(columnStatKeyToFileNameMap.containsKey(entry.getLeft())); + final Pair partitionFileNamePair = columnStatKeyToFileNameMap.get(entry.getLeft()); + ValidationUtils.checkState(!fileToColumnStatMap.containsKey(partitionFileNamePair)); + fileToColumnStatMap.put(partitionFileNamePair, columnStatMetadata.get()); + } + } else { + LOG.error("Meta index column stats missing for: " + entry.getLeft()); + } + } + } + return fileToColumnStatMap; + } + /** * Returns a list of all partitions. 
*/ protected List fetchAllPartitionPaths() throws IOException { HoodieTimer timer = new HoodieTimer().startTimer(); - Option> hoodieRecord = getRecordByKey(RECORDKEY_PARTITION_LIST, MetadataPartitionType.FILES.partitionPath()); + Option> hoodieRecord = getRecordByKey(RECORDKEY_PARTITION_LIST, + MetadataPartitionType.FILES.getPartitionPath()); metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_PARTITIONS_STR, timer.endTimer())); List partitions = Collections.emptyList(); @@ -181,7 +302,8 @@ FileStatus[] fetchAllFilesInPartition(Path partitionPath) throws IOException { } HoodieTimer timer = new HoodieTimer().startTimer(); - Option> hoodieRecord = getRecordByKey(partitionName, MetadataPartitionType.FILES.partitionPath()); + Option> hoodieRecord = getRecordByKey(partitionName, + MetadataPartitionType.FILES.getPartitionPath()); metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_FILES_STR, timer.endTimer())); FileStatus[] statuses = {}; @@ -215,7 +337,7 @@ Map fetchAllFilesInPartitionPaths(List partitionPath HoodieTimer timer = new HoodieTimer().startTimer(); List>>> partitionsFileStatus = - getRecordsByKeys(new ArrayList<>(partitionInfo.keySet()), MetadataPartitionType.FILES.partitionPath()); + getRecordsByKeys(new ArrayList<>(partitionInfo.keySet()), MetadataPartitionType.FILES.getPartitionPath()); metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_FILES_STR, timer.endTimer())); Map result = new HashMap<>(); @@ -231,18 +353,18 @@ Map fetchAllFilesInPartitionPaths(List partitionPath } /** - * May be handle spurious deletes. Depending on config, throw an exception or log a warn msg. + * Maybe handle spurious deletes. Depending on config, throw an exception or log a warn msg. * @param hoodieRecord instance of {@link HoodieRecord} of interest. * @param partitionName partition name of interest. */ private void mayBeHandleSpuriousDeletes(Option> hoodieRecord, String partitionName) { if (!hoodieRecord.get().getData().getDeletions().isEmpty()) { - if (!metadataConfig.ignoreSpuriousDeletes()) { + if (metadataConfig.ignoreSpuriousDeletes()) { + LOG.warn("Metadata record for " + partitionName + " encountered some files to be deleted which was not added before. " + + "Ignoring the spurious deletes as the `" + HoodieMetadataConfig.IGNORE_SPURIOUS_DELETES.key() + "` config is set to true"); + } else { throw new HoodieMetadataException("Metadata record for " + partitionName + " is inconsistent: " + hoodieRecord.get().getData()); - } else { - LOG.warn("Metadata record for " + partitionName + " encountered some files to be deleted which was not added before. 
" - + "Ignoring the spurious deletes as the `" + HoodieMetadataConfig.IGNORE_SPURIOUS_DELETES.key() + "` config is set to false"); } } } diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/FileSystemBackedTableMetadata.java b/hudi-common/src/main/java/org/apache/hudi/metadata/FileSystemBackedTableMetadata.java index d05b95dfdb495..a4e5ea3539f17 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/FileSystemBackedTableMetadata.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/FileSystemBackedTableMetadata.java @@ -18,6 +18,7 @@ package org.apache.hudi.metadata; +import org.apache.hudi.avro.model.HoodieMetadataColumnStats; import org.apache.hudi.common.config.SerializableConfiguration; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.fs.FSUtils; @@ -29,8 +30,10 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hudi.exception.HoodieMetadataException; import java.io.IOException; +import java.nio.ByteBuffer; import java.util.Arrays; import java.util.Collections; import java.util.List; @@ -139,4 +142,21 @@ public void close() throws Exception { public void reset() { // no-op } + + public Option getBloomFilter(final String partitionName, final String fileName) + throws HoodieMetadataException { + throw new HoodieMetadataException("Unsupported operation: getBloomFilter for " + fileName); + } + + @Override + public Map, ByteBuffer> getBloomFilters(final List> partitionNameFileNameList) + throws HoodieMetadataException { + throw new HoodieMetadataException("Unsupported operation: getBloomFilters!"); + } + + @Override + public Map, HoodieMetadataColumnStats> getColumnStats(final List> partitionNameFileNameList, final String columnName) + throws HoodieMetadataException { + throw new HoodieMetadataException("Unsupported operation: getColumnsStats!"); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java index c9e538f72eaa0..7b4dbd9a0b935 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java @@ -18,6 +18,9 @@ package org.apache.hudi.metadata; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.Path; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.model.HoodieMetadataRecord; import org.apache.hudi.avro.model.HoodieRestoreMetadata; @@ -27,6 +30,7 @@ import org.apache.hudi.common.config.SerializableConfiguration; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; @@ -48,10 +52,6 @@ import org.apache.hudi.exception.TableNotFoundException; import org.apache.hudi.io.storage.HoodieFileReader; import org.apache.hudi.io.storage.HoodieFileReaderFactory; - -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -64,6 +64,7 @@ import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; +import 
java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; /** @@ -80,8 +81,9 @@ public class HoodieBackedTableMetadata extends BaseTableMetadata { // should we reuse the open file handles, across calls private final boolean reuse; - // Readers for latest file slice corresponding to file groups in the metadata partition of interest - private Map> partitionReaders = new ConcurrentHashMap<>(); + // Readers for the latest file slice corresponding to file groups in the metadata partition + private Map, Pair> partitionReaders = + new ConcurrentHashMap<>(); public HoodieBackedTableMetadata(HoodieEngineContext engineContext, HoodieMetadataConfig metadataConfig, String datasetBasePath, String spillableMapDirectory) { @@ -97,7 +99,7 @@ public HoodieBackedTableMetadata(HoodieEngineContext engineContext, HoodieMetada private void initIfNeeded() { this.metadataBasePath = HoodieTableMetadata.getMetadataTableBasePath(dataBasePath); - if (!enabled) { + if (!isMetadataTableEnabled) { if (!HoodieTableMetadata.isMetadataTable(metadataBasePath)) { LOG.info("Metadata table is disabled."); } @@ -105,14 +107,16 @@ private void initIfNeeded() { try { this.metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf.get()).setBasePath(metadataBasePath).build(); this.metadataTableConfig = metadataMetaClient.getTableConfig(); + this.isBloomFilterIndexEnabled = metadataConfig.isBloomFilterIndexEnabled(); + this.isColumnStatsIndexEnabled = metadataConfig.isColumnStatsIndexEnabled(); } catch (TableNotFoundException e) { LOG.warn("Metadata table was not found at path " + metadataBasePath); - this.enabled = false; + this.isMetadataTableEnabled = false; this.metadataMetaClient = null; this.metadataTableConfig = null; } catch (Exception e) { LOG.error("Failed to initialize metadata table at path " + metadataBasePath, e); - this.enabled = false; + this.isMetadataTableEnabled = false; this.metadataMetaClient = null; this.metadataTableConfig = null; } @@ -125,30 +129,43 @@ protected Option> getRecordByKey(String key, return recordsByKeys.size() == 0 ? 
Option.empty() : recordsByKeys.get(0).getValue(); } - protected List>>> getRecordsByKeys(List keys, String partitionName) { - Pair readers = openReadersIfNeeded(keys.get(0), partitionName); - try { - List timings = new ArrayList<>(); - HoodieFileReader baseFileReader = readers.getKey(); - HoodieMetadataMergedLogRecordReader logRecordScanner = readers.getRight(); + @Override + protected List>>> getRecordsByKeys(List keys, + String partitionName) { + Map, List> partitionFileSliceToKeysMap = getPartitionFileSliceToKeysMapping(partitionName, keys); + List>>> result = new ArrayList<>(); + AtomicInteger fileSlicesKeysCount = new AtomicInteger(); + partitionFileSliceToKeysMap.forEach((partitionFileSlicePair, fileSliceKeys) -> { + Pair readers = openReadersIfNeeded(partitionName, + partitionFileSlicePair.getRight()); + try { + List timings = new ArrayList<>(); + HoodieFileReader baseFileReader = readers.getKey(); + HoodieMetadataMergedLogRecordReader logRecordScanner = readers.getRight(); - if (baseFileReader == null && logRecordScanner == null) { - return Collections.emptyList(); - } + if (baseFileReader == null && logRecordScanner == null) { + return; + } - // local map to assist in merging with base file records - Map>> logRecords = readLogRecords(logRecordScanner, keys, timings); - List>>> result = readFromBaseAndMergeWithLogRecords( - baseFileReader, keys, logRecords, timings, partitionName); - LOG.info(String.format("Metadata read for %s keys took [baseFileRead, logMerge] %s ms", keys.size(), timings)); - return result; - } catch (IOException ioe) { - throw new HoodieIOException("Error merging records from metadata table for " + keys.size() + " key : ", ioe); - } finally { - if (!reuse) { - close(partitionName); + // local map to assist in merging with base file records + Map>> logRecords = readLogRecords(logRecordScanner, + fileSliceKeys, timings); + result.addAll(readFromBaseAndMergeWithLogRecords(baseFileReader, fileSliceKeys, logRecords, + timings, partitionName)); + LOG.debug(String.format("Metadata read for %s keys took [baseFileRead, logMerge] %s ms", + fileSliceKeys.size(), timings)); + fileSlicesKeysCount.addAndGet(fileSliceKeys.size()); + } catch (IOException ioe) { + throw new HoodieIOException("Error merging records from metadata table for " + keys.size() + " key : ", ioe); + } finally { + if (!reuse) { + close(Pair.of(partitionFileSlicePair.getLeft(), partitionFileSlicePair.getRight().getFileId())); + } } - } + }); + + ValidationUtils.checkState(keys.size() == fileSlicesKeysCount.get()); + return result; } private Map>> readLogRecords(HoodieMetadataMergedLogRecordReader logRecordScanner, @@ -190,16 +207,16 @@ private List>>> readFrom // Retrieve record from base file if (baseFileReader != null) { HoodieTimer readTimer = new HoodieTimer(); + Map baseFileRecords = baseFileReader.getRecordsByKeys(keys); for (String key : keys) { readTimer.startTimer(); - Option baseRecord = baseFileReader.getRecordByKey(key); - if (baseRecord.isPresent()) { - hoodieRecord = getRecord(baseRecord, partitionName); + if (baseFileRecords.containsKey(key)) { + hoodieRecord = getRecord(Option.of(baseFileRecords.get(key)), partitionName); metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.BASEFILE_READ_STR, readTimer.endTimer())); // merge base file record w/ log record if present if (logRecords.containsKey(key) && logRecords.get(key).isPresent()) { HoodieRecordPayload mergedPayload = logRecords.get(key).get().getData().preCombine(hoodieRecord.getData()); - result.add(Pair.of(key, Option.of(new 
HoodieRecord(hoodieRecord.getKey(), mergedPayload)))); + result.add(Pair.of(key, Option.of(new HoodieAvroRecord(hoodieRecord.getKey(), mergedPayload)))); } else { // only base record result.add(Pair.of(key, Option.of(hoodieRecord))); @@ -233,38 +250,54 @@ private HoodieRecord getRecord(Option base } /** - * Returns a new pair of readers to the base and log files. + * Get the latest file slices for the interested keys in a given partition. + * + * @param partitionName - Partition to get the file slices from + * @param keys - Interested keys + * @return FileSlices for the keys */ - private Pair openReadersIfNeeded(String key, String partitionName) { - return partitionReaders.computeIfAbsent(partitionName, k -> { - try { - final long baseFileOpenMs; - final long logScannerOpenMs; - HoodieFileReader baseFileReader = null; - HoodieMetadataMergedLogRecordReader logRecordScanner = null; + private Map, List> getPartitionFileSliceToKeysMapping(final String partitionName, final List keys) { + // Metadata is in sync till the latest completed instant on the dataset + List latestFileSlices = + HoodieTableMetadataUtil.getPartitionLatestMergedFileSlices(metadataMetaClient, partitionName); + + Map, List> partitionFileSliceToKeysMap = new HashMap<>(); + for (String key : keys) { + final FileSlice slice = latestFileSlices.get(HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex(key, + latestFileSlices.size())); + final Pair partitionNameFileSlicePair = Pair.of(partitionName, slice); + partitionFileSliceToKeysMap.computeIfAbsent(partitionNameFileSlicePair, k -> new ArrayList<>()).add(key); + } + return partitionFileSliceToKeysMap; + } - // Metadata is in sync till the latest completed instant on the dataset + /** + * Create a file reader and the record scanner for a given partition and file slice + * if readers are not already available. 
+ * + * @param partitionName - Partition name + * @param slice - The file slice to open readers for + * @return File reader and the record scanner pair for the requested file slice + */ + private Pair openReadersIfNeeded(String partitionName, FileSlice slice) { + return partitionReaders.computeIfAbsent(Pair.of(partitionName, slice.getFileId()), k -> { + try { HoodieTimer timer = new HoodieTimer().startTimer(); - List latestFileSlices = HoodieTableMetadataUtil.getPartitionLatestMergedFileSlices(metadataMetaClient, partitionName); - if (latestFileSlices.size() == 0) { - // empty partition - return Pair.of(null, null); - } - ValidationUtils.checkArgument(latestFileSlices.size() == 1, String.format("Invalid number of file slices: found=%d, required=%d", latestFileSlices.size(), 1)); - final FileSlice slice = latestFileSlices.get(HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex(key, latestFileSlices.size())); // Open base file reader Pair baseFileReaderOpenTimePair = getBaseFileReader(slice, timer); - baseFileReader = baseFileReaderOpenTimePair.getKey(); - baseFileOpenMs = baseFileReaderOpenTimePair.getValue(); + HoodieFileReader baseFileReader = baseFileReaderOpenTimePair.getKey(); + final long baseFileOpenMs = baseFileReaderOpenTimePair.getValue(); // Open the log record scanner using the log files from the latest file slice - Pair logRecordScannerOpenTimePair = getLogRecordScanner(slice, - partitionName); - logRecordScanner = logRecordScannerOpenTimePair.getKey(); - logScannerOpenMs = logRecordScannerOpenTimePair.getValue(); - - metrics.ifPresent(metrics -> metrics.updateMetrics(HoodieMetadataMetrics.SCAN_STR, baseFileOpenMs + logScannerOpenMs)); + List logFiles = slice.getLogFiles().collect(Collectors.toList()); + Pair logRecordScannerOpenTimePair = + getLogRecordScanner(logFiles, partitionName); + HoodieMetadataMergedLogRecordReader logRecordScanner = logRecordScannerOpenTimePair.getKey(); + final long logScannerOpenMs = logRecordScannerOpenTimePair.getValue(); + + metrics.ifPresent(metrics -> metrics.updateMetrics(HoodieMetadataMetrics.SCAN_STR, + +baseFileOpenMs + logScannerOpenMs)); return Pair.of(baseFileReader, logRecordScanner); } catch (IOException e) { throw new HoodieIOException("Error opening readers for metadata table partition " + partitionName, e); @@ -312,9 +345,9 @@ private Set getValidInstantTimestamps() { return validInstantTimestamps; } - private Pair getLogRecordScanner(FileSlice slice, String partitionName) { + public Pair getLogRecordScanner(List logFiles, String partitionName) { HoodieTimer timer = new HoodieTimer().startTimer(); - List logFilePaths = slice.getLogFiles() + List sortedLogFilePaths = logFiles.stream() .sorted(HoodieLogFile.getLogFileComparator()) .map(o -> o.getPath().toString()) .collect(Collectors.toList()); @@ -332,7 +365,7 @@ private Pair getLogRecordScanner(File HoodieMetadataMergedLogRecordReader logRecordScanner = HoodieMetadataMergedLogRecordReader.newBuilder() .withFileSystem(metadataMetaClient.getFs()) .withBasePath(metadataBasePath) - .withLogFilePaths(logFilePaths) + .withLogFilePaths(sortedLogFilePaths) .withReaderSchema(schema) .withLatestInstantTime(latestMetadataInstantTime) .withMaxMemorySizeInBytes(MAX_MEMORY_SIZE_IN_BYTES) @@ -347,7 +380,7 @@ private Pair getLogRecordScanner(File Long logScannerOpenMs = timer.endTimer(); LOG.info(String.format("Opened %d metadata log files (dataset instant=%s, metadata instant=%s) in %d ms", - logFilePaths.size(), getLatestDataInstantTime(), latestMetadataInstantTime, logScannerOpenMs)); + 
sortedLogFilePaths.size(), getLatestDataInstantTime(), latestMetadataInstantTime, logScannerOpenMs)); return Pair.of(logRecordScanner, logScannerOpenMs); } @@ -382,14 +415,20 @@ private List getRollbackedCommits(HoodieInstant instant, HoodieActiveTim @Override public void close() { - for (String partitionName : partitionReaders.keySet()) { - close(partitionName); + for (Pair partitionFileSlicePair : partitionReaders.keySet()) { + close(partitionFileSlicePair); } partitionReaders.clear(); } - private synchronized void close(String partitionName) { - Pair readers = partitionReaders.remove(partitionName); + /** + * Close the file reader and the record scanner for the given file slice. + * + * @param partitionFileSlicePair - Partition and FileSlice + */ + private synchronized void close(Pair partitionFileSlicePair) { + Pair readers = + partitionReaders.remove(partitionFileSlicePair); if (readers != null) { try { if (readers.getKey() != null) { @@ -405,7 +444,7 @@ private synchronized void close(String partitionName) { } public boolean enabled() { - return enabled; + return isMetadataTableEnabled; } public SerializableConfiguration getHadoopConf() { diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMergedLogRecordReader.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMergedLogRecordReader.java index 01c8d05e9b220..4f616c362fbf6 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMergedLogRecordReader.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMergedLogRecordReader.java @@ -18,32 +18,32 @@ package org.apache.hudi.metadata; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Set; - -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; -import org.apache.hadoop.fs.FileSystem; - import org.apache.hudi.common.config.HoodieMetadataConfig; -import org.apache.hudi.common.table.HoodieTableConfig; -import org.apache.hudi.common.util.SpillableMapUtils; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; import org.apache.hudi.common.table.log.InstantRange; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.SpillableMapUtils; import org.apache.hudi.common.util.collection.ExternalSpillableMap; import org.apache.hudi.common.util.collection.Pair; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.FileSystem; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Set; + /** * A {@code HoodieMergedLogRecordScanner} implementation which only merged records matching providing keys. This is * useful in limiting memory usage when only a small subset of updates records are to be read. 
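The per-key routing in `getRecordsByKeys` above relies on `HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex` to deterministically pick a file group for each key. A simplified stand-in of that idea (the actual hash used by Hudi may differ) showing how keys are bucketed per file group before the readers are opened, mirroring `getPartitionFileSliceToKeysMapping`:

```java
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class KeyToFileGroupShardingSketch {

  // Simplified stand-in for HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex:
  // any deterministic hash works, as long as the writer and the reader agree on it.
  static int mapRecordKeyToFileGroupIndex(String recordKey, int numFileGroups) {
    int h = 0;
    for (int i = 0; i < recordKey.length(); i++) {
      h = 31 * h + recordKey.charAt(i);
    }
    return Math.abs(h % numFileGroups);
  }

  // Bucket the lookup keys by their target file group so each file slice's
  // readers are opened once and handed the whole batch of keys.
  static Map<Integer, List<String>> groupKeysByFileGroup(List<String> keys, int numFileGroups) {
    Map<Integer, List<String>> grouped = new HashMap<>();
    for (String key : keys) {
      int fileGroupIndex = mapRecordKeyToFileGroupIndex(key, numFileGroups);
      grouped.computeIfAbsent(fileGroupIndex, k -> new ArrayList<>()).add(key);
    }
    return grouped;
  }
}
```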
@@ -87,7 +87,7 @@ protected void processNextDeletedKey(HoodieKey hoodieKey) { } @Override - protected HoodieRecord createHoodieRecord(final IndexedRecord rec, final HoodieTableConfig hoodieTableConfig, + protected HoodieAvroRecord createHoodieRecord(final IndexedRecord rec, final HoodieTableConfig hoodieTableConfig, final String payloadClassFQN, final String preCombineField, final boolean withOperationField, final Option> simpleKeyGenFields, @@ -116,7 +116,7 @@ public static HoodieMetadataMergedLogRecordReader.Builder newBuilder() { * @param key Key of the record to retrieve * @return {@code HoodieRecord} if key was found else {@code Option.empty()} */ - public List>>> getRecordByKey(String key) { + public synchronized List>>> getRecordByKey(String key) { return Collections.singletonList(Pair.of(key, Option.ofNullable((HoodieRecord) records.get(key)))); } @@ -139,7 +139,7 @@ public synchronized List @Override protected String getKeyField() { - return HoodieMetadataPayload.SCHEMA_FIELD_ID_KEY; + return HoodieMetadataPayload.KEY_FIELD_NAME; } /** diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMetrics.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMetrics.java index 2efc96c6f3dee..fe8612c42e802 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMetrics.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataMetrics.java @@ -41,6 +41,8 @@ public class HoodieMetadataMetrics implements Serializable { // Metric names public static final String LOOKUP_PARTITIONS_STR = "lookup_partitions"; public static final String LOOKUP_FILES_STR = "lookup_files"; + public static final String LOOKUP_BLOOM_FILTERS_METADATA_STR = "lookup_meta_index_bloom_filters"; + public static final String LOOKUP_COLUMN_STATS_METADATA_STR = "lookup_meta_index_column_ranges"; public static final String SCAN_STR = "scan"; public static final String BASEFILE_READ_STR = "basefile_read"; public static final String INITIALIZE_STR = "initialize"; @@ -77,7 +79,7 @@ private Map getStats(HoodieTableFileSystemView fsView, boolean d Map stats = new HashMap<>(); // Total size of the metadata and count of base/log files - for (String metadataPartition : MetadataPartitionType.all()) { + for (String metadataPartition : MetadataPartitionType.allPaths()) { List latestSlices = fsView.getLatestFileSlices(metadataPartition).collect(Collectors.toList()); // Total size of the metadata and count of base/log files diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java index 0b0d144a6e7e9..221b52e77e674 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java @@ -18,88 +18,199 @@ package org.apache.hudi.metadata; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.avro.model.HoodieMetadataBloomFilter; +import org.apache.hudi.avro.model.HoodieMetadataColumnStats; import org.apache.hudi.avro.model.HoodieMetadataFileInfo; import org.apache.hudi.avro.model.HoodieMetadataRecord; +import org.apache.hudi.common.bloom.BloomFilterTypeCode; +import 
org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieColumnRangeMetadata; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.common.util.hash.ColumnIndexID; +import org.apache.hudi.common.util.hash.FileIndexID; +import org.apache.hudi.common.util.hash.PartitionIndexID; import org.apache.hudi.exception.HoodieMetadataException; -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; +import org.apache.hudi.io.storage.HoodieHFileReader; import java.io.IOException; +import java.nio.ByteBuffer; import java.util.Arrays; +import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Properties; import java.util.stream.Collectors; import java.util.stream.Stream; +import static org.apache.hudi.TypeUtils.unsafeCast; +import static org.apache.hudi.common.util.ValidationUtils.checkArgument; +import static org.apache.hudi.common.util.ValidationUtils.checkState; import static org.apache.hudi.metadata.HoodieTableMetadata.RECORDKEY_PARTITION_LIST; /** - * This is a payload which saves information about a single entry in the Metadata Table. - * - * The type of the entry is determined by the "type" saved within the record. The following types of entries are saved: - * - * 1. List of partitions: There is a single such record - * key="__all_partitions__" - * - * 2. List of files in a Partition: There is one such record for each partition - * key=Partition name - * - * During compaction on the table, the deletions are merged with additions and hence pruned. - * - * Metadata Table records are saved with the schema defined in HoodieMetadata.avsc. This class encapsulates the - * HoodieMetadataRecord for ease of operations. + * MetadataTable records are persisted with the schema defined in HoodieMetadata.avsc. + * This class represents the payload for the MetadataTable. + *

+ * This single metadata payload is shared by all the partitions under the metadata table. + * The partition-specific records are determined by the field "type" saved within the record. + * The following types are supported: + *

+ * METADATA_TYPE_PARTITION_LIST (1): + * -- List of all partitions. There is a single such record + * -- key = {@link HoodieTableMetadata#RECORDKEY_PARTITION_LIST} + *

+ * METADATA_TYPE_FILE_LIST (2): + * -- List of all files in a partition. There is one such record for each partition + * -- key = partition name + *

+ * METADATA_TYPE_COLUMN_STATS (3): + * -- This is an index for column stats in the table + *

+ * METADATA_TYPE_BLOOM_FILTER (4): + * -- This is an index for base file bloom filters. This is a map of FileID to its BloomFilter byte[]. + *

+ * During compaction on the table, the deletions are merged with additions and hence records are pruned. */ public class HoodieMetadataPayload implements HoodieRecordPayload { + // Type of the record. This can be an enum in the schema but Avro 1.8 + // has a bug - https://issues.apache.org/jira/browse/AVRO-1810 + protected static final int METADATA_TYPE_PARTITION_LIST = 1; + protected static final int METADATA_TYPE_FILE_LIST = 2; + protected static final int METADATA_TYPE_COLUMN_STATS = 3; + protected static final int METADATA_TYPE_BLOOM_FILTER = 4; + // HoodieMetadata schema field ids - public static final String SCHEMA_FIELD_ID_KEY = "key"; - public static final String SCHEMA_FIELD_ID_TYPE = "type"; - public static final String SCHEMA_FIELD_ID_METADATA = "filesystemMetadata"; + public static final String KEY_FIELD_NAME = HoodieHFileReader.KEY_FIELD_NAME; + public static final String SCHEMA_FIELD_NAME_TYPE = "type"; + public static final String SCHEMA_FIELD_NAME_METADATA = "filesystemMetadata"; + public static final String SCHEMA_FIELD_ID_COLUMN_STATS = "ColumnStatsMetadata"; + public static final String SCHEMA_FIELD_ID_BLOOM_FILTER = "BloomFilterMetadata"; + + // HoodieMetadata bloom filter payload field ids + private static final String FIELD_IS_DELETED = "isDeleted"; + private static final String BLOOM_FILTER_FIELD_TYPE = "type"; + private static final String BLOOM_FILTER_FIELD_TIMESTAMP = "timestamp"; + private static final String BLOOM_FILTER_FIELD_BLOOM_FILTER = "bloomFilter"; + private static final String BLOOM_FILTER_FIELD_IS_DELETED = FIELD_IS_DELETED; - // Type of the record - // This can be an enum in the schema but Avro 1.8 has a bug - https://issues.apache.org/jira/browse/AVRO-1810 - private static final int PARTITION_LIST = 1; - private static final int FILE_LIST = 2; + // HoodieMetadata column stats payload field ids + private static final String COLUMN_STATS_FIELD_MIN_VALUE = "minValue"; + private static final String COLUMN_STATS_FIELD_MAX_VALUE = "maxValue"; + private static final String COLUMN_STATS_FIELD_NULL_COUNT = "nullCount"; + private static final String COLUMN_STATS_FIELD_VALUE_COUNT = "valueCount"; + private static final String COLUMN_STATS_FIELD_TOTAL_SIZE = "totalSize"; + private static final String COLUMN_STATS_FIELD_RESOURCE_NAME = "fileName"; + private static final String COLUMN_STATS_FIELD_TOTAL_UNCOMPRESSED_SIZE = "totalUncompressedSize"; + private static final String COLUMN_STATS_FIELD_IS_DELETED = FIELD_IS_DELETED; private String key = null; private int type = 0; private Map filesystemMetadata = null; + private HoodieMetadataBloomFilter bloomFilterMetadata = null; + private HoodieMetadataColumnStats columnStatMetadata = null; public HoodieMetadataPayload(GenericRecord record, Comparable orderingVal) { this(Option.of(record)); } - public HoodieMetadataPayload(Option record) { - if (record.isPresent()) { + public HoodieMetadataPayload(Option recordOpt) { + if (recordOpt.isPresent()) { + GenericRecord record = recordOpt.get(); // This can be simplified using SpecificData.deepcopy once this bug is fixed // https://issues.apache.org/jira/browse/AVRO-1811 - key = record.get().get(SCHEMA_FIELD_ID_KEY).toString(); - type = (int) record.get().get(SCHEMA_FIELD_ID_TYPE); - if (record.get().get(SCHEMA_FIELD_ID_METADATA) != null) { - filesystemMetadata = (Map) record.get().get("filesystemMetadata"); + // + // NOTE: {@code HoodieMetadataRecord} has to always carry both "key" and "type" fields + // for it to be handled appropriately, therefore these fields have to be reflected + // in any (read-)projected schema + key = record.get(KEY_FIELD_NAME).toString(); + type = (int) record.get(SCHEMA_FIELD_NAME_TYPE); + + Map metadata = getNestedFieldValue(record, SCHEMA_FIELD_NAME_METADATA); + if (metadata != null) { + filesystemMetadata = metadata; filesystemMetadata.keySet().forEach(k -> { GenericRecord v = filesystemMetadata.get(k); - filesystemMetadata.put(k.toString(), new HoodieMetadataFileInfo((Long) v.get("size"), (Boolean) v.get("isDeleted"))); + filesystemMetadata.put(k, new HoodieMetadataFileInfo((Long) v.get("size"), (Boolean) v.get("isDeleted"))); }); } + + if (type == METADATA_TYPE_BLOOM_FILTER) { + GenericRecord bloomFilterRecord = getNestedFieldValue(record, SCHEMA_FIELD_ID_BLOOM_FILTER); + // NOTE: The only legitimate reason for {@code BloomFilterMetadata} to not be present is when + // it's not been read from the storage (i.e. it's not been a part of projected schema). + // Otherwise, it has to be present or the record would be considered invalid + if (bloomFilterRecord == null) { + checkArgument(record.getSchema().getField(SCHEMA_FIELD_ID_BLOOM_FILTER) == null, + String.format("Valid %s record expected for type: %s", SCHEMA_FIELD_ID_BLOOM_FILTER, METADATA_TYPE_BLOOM_FILTER)); + } else { + bloomFilterMetadata = new HoodieMetadataBloomFilter( + (String) bloomFilterRecord.get(BLOOM_FILTER_FIELD_TYPE), + (String) bloomFilterRecord.get(BLOOM_FILTER_FIELD_TIMESTAMP), + (ByteBuffer) bloomFilterRecord.get(BLOOM_FILTER_FIELD_BLOOM_FILTER), + (Boolean) bloomFilterRecord.get(BLOOM_FILTER_FIELD_IS_DELETED) + ); + } + } + + if (type == METADATA_TYPE_COLUMN_STATS) { + GenericRecord columnStatsRecord = getNestedFieldValue(record, SCHEMA_FIELD_ID_COLUMN_STATS); + // NOTE: The only legitimate reason for {@code ColumnStatsMetadata} to not be present is when + // it's not been read from the storage (i.e. it's not been a part of projected schema).
+ // Otherwise, it has to be present or the record would be considered invalid + if (columnStatsRecord == null) { + checkArgument(record.getSchema().getField(SCHEMA_FIELD_ID_COLUMN_STATS) == null, + String.format("Valid %s record expected for type: %s", SCHEMA_FIELD_ID_COLUMN_STATS, METADATA_TYPE_COLUMN_STATS)); + } else { + columnStatMetadata = HoodieMetadataColumnStats.newBuilder() + .setFileName((String) columnStatsRecord.get(COLUMN_STATS_FIELD_RESOURCE_NAME)) + .setMinValue((String) columnStatsRecord.get(COLUMN_STATS_FIELD_MIN_VALUE)) + .setMaxValue((String) columnStatsRecord.get(COLUMN_STATS_FIELD_MAX_VALUE)) + .setValueCount((Long) columnStatsRecord.get(COLUMN_STATS_FIELD_VALUE_COUNT)) + .setNullCount((Long) columnStatsRecord.get(COLUMN_STATS_FIELD_NULL_COUNT)) + .setTotalSize((Long) columnStatsRecord.get(COLUMN_STATS_FIELD_TOTAL_SIZE)) + .setTotalUncompressedSize((Long) columnStatsRecord.get(COLUMN_STATS_FIELD_TOTAL_UNCOMPRESSED_SIZE)) + .setIsDeleted((Boolean) columnStatsRecord.get(COLUMN_STATS_FIELD_IS_DELETED)) + .build(); + } + } } } private HoodieMetadataPayload(String key, int type, Map filesystemMetadata) { + this(key, type, filesystemMetadata, null, null); + } + + private HoodieMetadataPayload(String key, HoodieMetadataBloomFilter metadataBloomFilter) { + this(key, METADATA_TYPE_BLOOM_FILTER, null, metadataBloomFilter, null); + } + + private HoodieMetadataPayload(String key, HoodieMetadataColumnStats columnStats) { + this(key, METADATA_TYPE_COLUMN_STATS, null, null, columnStats); + } + + protected HoodieMetadataPayload(String key, int type, + Map filesystemMetadata, + HoodieMetadataBloomFilter metadataBloomFilter, + HoodieMetadataColumnStats columnStats) { this.key = key; this.type = type; this.filesystemMetadata = filesystemMetadata; + this.bloomFilterMetadata = metadataBloomFilter; + this.columnStatMetadata = columnStats; } /** @@ -109,69 +220,122 @@ private HoodieMetadataPayload(String key, int type, Map createPartitionListRecord(List partitions) { Map fileInfo = new HashMap<>(); - partitions.forEach(partition -> fileInfo.put(partition, new HoodieMetadataFileInfo(0L, false))); + partitions.forEach(partition -> fileInfo.put(partition, new HoodieMetadataFileInfo(0L, false))); - HoodieKey key = new HoodieKey(RECORDKEY_PARTITION_LIST, MetadataPartitionType.FILES.partitionPath()); - HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), PARTITION_LIST, fileInfo); - return new HoodieRecord<>(key, payload); + HoodieKey key = new HoodieKey(RECORDKEY_PARTITION_LIST, MetadataPartitionType.FILES.getPartitionPath()); + HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), METADATA_TYPE_PARTITION_LIST, + fileInfo); + return new HoodieAvroRecord<>(key, payload); } /** * Create and return a {@code HoodieMetadataPayload} to save list of files within a partition. 
* - * @param partition The name of the partition - * @param filesAdded Mapping of files to their sizes for files which have been added to this partition + * @param partition The name of the partition + * @param filesAdded Mapping of files to their sizes for files which have been added to this partition * @param filesDeleted List of files which have been deleted from this partition */ public static HoodieRecord createPartitionFilesRecord(String partition, - Option> filesAdded, Option> filesDeleted) { + Option> filesAdded, + Option> filesDeleted) { Map fileInfo = new HashMap<>(); filesAdded.ifPresent( m -> m.forEach((filename, size) -> fileInfo.put(filename, new HoodieMetadataFileInfo(size, false)))); filesDeleted.ifPresent( - m -> m.forEach(filename -> fileInfo.put(filename, new HoodieMetadataFileInfo(0L, true)))); + m -> m.forEach(filename -> fileInfo.put(filename, new HoodieMetadataFileInfo(0L, true)))); - HoodieKey key = new HoodieKey(partition, MetadataPartitionType.FILES.partitionPath()); - HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), FILE_LIST, fileInfo); - return new HoodieRecord<>(key, payload); + HoodieKey key = new HoodieKey(partition, MetadataPartitionType.FILES.getPartitionPath()); + HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), METADATA_TYPE_FILE_LIST, fileInfo); + return new HoodieAvroRecord<>(key, payload); + } + + /** + * Create bloom filter metadata record. + * + * @param partitionName - Partition name + * @param baseFileName - Base file name for which the bloom filter needs to persisted + * @param timestamp - Instant timestamp responsible for this record + * @param bloomFilter - Bloom filter for the File + * @param isDeleted - Is the bloom filter no more valid + * @return Metadata payload containing the fileID and its bloom filter record + */ + public static HoodieRecord createBloomFilterMetadataRecord(final String partitionName, + final String baseFileName, + final String timestamp, + final ByteBuffer bloomFilter, + final boolean isDeleted) { + ValidationUtils.checkArgument(!baseFileName.contains(Path.SEPARATOR) + && FSUtils.isBaseFile(new Path(baseFileName)), + "Invalid base file '" + baseFileName + "' for MetaIndexBloomFilter!"); + final String bloomFilterIndexKey = new PartitionIndexID(partitionName).asBase64EncodedString() + .concat(new FileIndexID(baseFileName).asBase64EncodedString()); + HoodieKey key = new HoodieKey(bloomFilterIndexKey, MetadataPartitionType.BLOOM_FILTERS.getPartitionPath()); + + // TODO: HUDI-3203 Get the bloom filter type from the file + HoodieMetadataBloomFilter metadataBloomFilter = + new HoodieMetadataBloomFilter(BloomFilterTypeCode.DYNAMIC_V0.name(), + timestamp, bloomFilter, isDeleted); + HoodieMetadataPayload metadataPayload = new HoodieMetadataPayload(key.getRecordKey(), + metadataBloomFilter); + return new HoodieAvroRecord<>(key, metadataPayload); } @Override public HoodieMetadataPayload preCombine(HoodieMetadataPayload previousRecord) { ValidationUtils.checkArgument(previousRecord.type == type, - "Cannot combine " + previousRecord.type + " with " + type); - - Map combinedFileInfo = null; + "Cannot combine " + previousRecord.type + " with " + type); switch (type) { - case PARTITION_LIST: - case FILE_LIST: - combinedFileInfo = combineFilesystemMetadata(previousRecord); - break; + case METADATA_TYPE_PARTITION_LIST: + case METADATA_TYPE_FILE_LIST: + Map combinedFileInfo = combineFilesystemMetadata(previousRecord); + return new HoodieMetadataPayload(key, type, combinedFileInfo); 
+ case METADATA_TYPE_BLOOM_FILTER: + HoodieMetadataBloomFilter combineBloomFilterMetadata = combineBloomFilterMetadata(previousRecord); + return new HoodieMetadataPayload(key, combineBloomFilterMetadata); + case METADATA_TYPE_COLUMN_STATS: + return new HoodieMetadataPayload(key, combineColumnStatsMetadata(previousRecord)); default: throw new HoodieMetadataException("Unknown type of HoodieMetadataPayload: " + type); } + } + + private HoodieMetadataBloomFilter combineBloomFilterMetadata(HoodieMetadataPayload previousRecord) { + return this.bloomFilterMetadata; + } - return new HoodieMetadataPayload(key, type, combinedFileInfo); + private HoodieMetadataColumnStats combineColumnStatsMetadata(HoodieMetadataPayload previousRecord) { + return this.columnStatMetadata; } @Override - public Option combineAndGetUpdateValue(IndexedRecord oldRecord, Schema schema) throws IOException { - HoodieMetadataPayload anotherPayload = new HoodieMetadataPayload(Option.of((GenericRecord)oldRecord)); + public Option combineAndGetUpdateValue(IndexedRecord oldRecord, Schema schema, Properties properties) throws IOException { + HoodieMetadataPayload anotherPayload = new HoodieMetadataPayload(Option.of((GenericRecord) oldRecord)); HoodieRecordPayload combinedPayload = preCombine(anotherPayload); - return combinedPayload.getInsertValue(schema); + return combinedPayload.getInsertValue(schema, properties); } @Override - public Option getInsertValue(Schema schema) throws IOException { + public Option combineAndGetUpdateValue(IndexedRecord oldRecord, Schema schema) throws IOException { + return combineAndGetUpdateValue(oldRecord, schema, new Properties()); + } + + @Override + public Option getInsertValue(Schema schema, Properties properties) throws IOException { if (key == null) { return Option.empty(); } - HoodieMetadataRecord record = new HoodieMetadataRecord(key, type, filesystemMetadata); + HoodieMetadataRecord record = new HoodieMetadataRecord(key, type, filesystemMetadata, bloomFilterMetadata, + columnStatMetadata); return Option.of(record); } + @Override + public Option getInsertValue(Schema schema) throws IOException { + return getInsertValue(schema, new Properties()); + } + /** * Returns the list of filenames added as part of this record. */ @@ -186,6 +350,28 @@ public List getDeletions() { return filterFileInfoEntries(true).map(Map.Entry::getKey).sorted().collect(Collectors.toList()); } + /** + * Get the bloom filter metadata from this payload. + */ + public Option getBloomFilterMetadata() { + if (bloomFilterMetadata == null) { + return Option.empty(); + } + + return Option.of(bloomFilterMetadata); + } + + /** + * Get the bloom filter metadata from this payload. + */ + public Option getColumnStatMetadata() { + if (columnStatMetadata == null) { + return Option.empty(); + } + + return Option.of(columnStatMetadata); + } + /** * Returns the files added as part of this record. */ @@ -234,14 +420,101 @@ private Map combineFilesystemMetadata(HoodieMeta return combinedFileInfo; } + /** + * Get bloom filter index key. + * + * @param partitionIndexID - Partition index id + * @param fileIndexID - File index id + * @return Bloom filter index key + */ + public static String getBloomFilterIndexKey(PartitionIndexID partitionIndexID, FileIndexID fileIndexID) { + return partitionIndexID.asBase64EncodedString() + .concat(fileIndexID.asBase64EncodedString()); + } + + /** + * Get column stats index key. 
+ * + * @param partitionIndexID - Partition index id + * @param fileIndexID - File index id + * @param columnIndexID - Column index id + * @return Column stats index key + */ + public static String getColumnStatsIndexKey(PartitionIndexID partitionIndexID, FileIndexID fileIndexID, ColumnIndexID columnIndexID) { + return columnIndexID.asBase64EncodedString() + .concat(partitionIndexID.asBase64EncodedString()) + .concat(fileIndexID.asBase64EncodedString()); + } + + /** + * Get column stats index key from the column range metadata. + * + * @param partitionName - Partition name + * @param columnRangeMetadata - Column range metadata + * @return Column stats index key + */ + public static String getColumnStatsIndexKey(String partitionName, HoodieColumnRangeMetadata columnRangeMetadata) { + final PartitionIndexID partitionIndexID = new PartitionIndexID(partitionName); + final FileIndexID fileIndexID = new FileIndexID(new Path(columnRangeMetadata.getFilePath()).getName()); + final ColumnIndexID columnIndexID = new ColumnIndexID(columnRangeMetadata.getColumnName()); + return getColumnStatsIndexKey(partitionIndexID, fileIndexID, columnIndexID); + } + + public static Stream createColumnStatsRecords( + String partitionName, Collection> columnRangeMetadataList, boolean isDeleted) { + return columnRangeMetadataList.stream().map(columnRangeMetadata -> { + HoodieKey key = new HoodieKey(getColumnStatsIndexKey(partitionName, columnRangeMetadata), + MetadataPartitionType.COLUMN_STATS.getPartitionPath()); + HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), + HoodieMetadataColumnStats.newBuilder() + .setFileName(new Path(columnRangeMetadata.getFilePath()).getName()) + .setMinValue(columnRangeMetadata.getMinValue() == null ? null : + columnRangeMetadata.getMinValue().toString()) + .setMaxValue(columnRangeMetadata.getMaxValue() == null ? 
null : + columnRangeMetadata.getMaxValue().toString()) + .setNullCount(columnRangeMetadata.getNullCount()) + .setValueCount(columnRangeMetadata.getValueCount()) + .setTotalSize(columnRangeMetadata.getTotalSize()) + .setTotalUncompressedSize(columnRangeMetadata.getTotalUncompressedSize()) + .setIsDeleted(isDeleted) + .build()); + return new HoodieAvroRecord<>(key, payload); + }); + + + } + @Override public String toString() { final StringBuilder sb = new StringBuilder("HoodieMetadataPayload {"); - sb.append(SCHEMA_FIELD_ID_KEY + "=").append(key).append(", "); - sb.append(SCHEMA_FIELD_ID_TYPE + "=").append(type).append(", "); + sb.append(KEY_FIELD_NAME + "=").append(key).append(", "); + sb.append(SCHEMA_FIELD_NAME_TYPE + "=").append(type).append(", "); sb.append("creations=").append(Arrays.toString(getFilenames().toArray())).append(", "); sb.append("deletions=").append(Arrays.toString(getDeletions().toArray())).append(", "); + if (type == METADATA_TYPE_BLOOM_FILTER) { + checkState(getBloomFilterMetadata().isPresent()); + sb.append("BloomFilter: {"); + sb.append("bloom size: " + getBloomFilterMetadata().get().getBloomFilter().array().length).append(", "); + sb.append("timestamp: " + getBloomFilterMetadata().get().getTimestamp()).append(", "); + sb.append("deleted: " + getBloomFilterMetadata().get().getIsDeleted()); + sb.append("}"); + } + if (type == METADATA_TYPE_COLUMN_STATS) { + checkState(getColumnStatMetadata().isPresent()); + sb.append("ColStats: {"); + sb.append(getColumnStatMetadata().get()); + sb.append("}"); + } sb.append('}'); return sb.toString(); } + + private static T getNestedFieldValue(GenericRecord record, String fieldName) { + // NOTE: This routine is more lightweight than {@code HoodieAvroUtils.getNestedFieldVal} + if (record.getSchema().getField(fieldName) == null) { + return null; + } + + return unsafeCast(record.get(fieldName)); + } } diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadata.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadata.java index d981b7085195b..52fdbd993627f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadata.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadata.java @@ -18,19 +18,25 @@ package org.apache.hudi.metadata; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.avro.model.HoodieMetadataColumnStats; import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.config.SerializableConfiguration; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.util.Option; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieMetadataException; import java.io.IOException; import java.io.Serializable; +import java.nio.ByteBuffer; import java.util.List; import java.util.Map; +import static org.apache.hudi.common.util.ValidationUtils.checkArgument; + /** * Interface that supports querying various pieces of metadata about a hudi table. */ @@ -54,12 +60,19 @@ public interface HoodieTableMetadata extends Serializable, AutoCloseable { static final String METADATA_TABLE_REL_PATH = HoodieTableMetaClient.METAFOLDER_NAME + Path.SEPARATOR + "metadata"; /** - * Return the base path of the Metadata Table. 
- * - * @param tableBasePath The base path of the dataset + * Return the base path of the Metadata Table for the Dataset identified by the given base path */ - static String getMetadataTableBasePath(String tableBasePath) { - return tableBasePath + Path.SEPARATOR + METADATA_TABLE_REL_PATH; + static String getMetadataTableBasePath(String dataTableBasePath) { + return dataTableBasePath + Path.SEPARATOR + METADATA_TABLE_REL_PATH; + } + + /** + * Returns the base path of the Dataset, given the base path of its + * Metadata Table + */ + static String getDataTableBasePathFromMetadataTable(String metadataTableBasePath) { + checkArgument(isMetadataTable(metadataTableBasePath)); + return metadataTableBasePath.substring(0, metadataTableBasePath.lastIndexOf(METADATA_TABLE_REL_PATH) - 1); } /** @@ -104,6 +117,38 @@ static HoodieTableMetadata create(HoodieEngineContext engineContext, HoodieMetad */ Map getAllFilesInPartitions(List partitionPaths) throws IOException; + /** + * Get the bloom filter for the FileID from the metadata table. + * + * @param partitionName - Partition name + * @param fileName - File name for which the bloom filter needs to be retrieved + * @return BloomFilter byte buffer if available, otherwise empty + * @throws HoodieMetadataException + */ + Option getBloomFilter(final String partitionName, final String fileName) + throws HoodieMetadataException; + + /** + * Get bloom filters for files from the metadata table index. + * + * @param partitionNameFileNameList - List of partition and file name pairs for which bloom filters need to be retrieved + * @return Map of partition file name pair to its bloom filter byte buffer + * @throws HoodieMetadataException + */ + Map, ByteBuffer> getBloomFilters(final List> partitionNameFileNameList) + throws HoodieMetadataException; + + /** + * Get column stats for files from the metadata table index. + * + * @param partitionNameFileNameList - List of partition and file name pairs for which column stats need to be retrieved + * @param columnName - Column name for which stats are needed + * @return Map of partition and file name pair to its column stats + * @throws HoodieMetadataException + */ + Map, HoodieMetadataColumnStats> getColumnStats(final List> partitionNameFileNameList, final String columnName) + throws HoodieMetadataException; + /** * Get the instant time to which the metadata is synced w.r.t data timeline.
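The two base-path helpers above are intended to be inverses of each other. A quick round-trip sketch against the static methods in this interface; the local path is illustrative:

```java
import org.apache.hudi.metadata.HoodieTableMetadata;

public class BasePathRoundTripSketch {
  public static void main(String[] args) {
    String dataTableBasePath = "/tmp/hudi/trips";  // illustrative path
    String metadataTableBasePath =
        HoodieTableMetadata.getMetadataTableBasePath(dataTableBasePath);
    // => /tmp/hudi/trips/.hoodie/metadata

    // The inverse helper added in this patch recovers the data table base path.
    String recovered =
        HoodieTableMetadata.getDataTableBasePathFromMetadataTable(metadataTableBasePath);
    System.out.println(recovered.equals(dataTableBasePath));  // true
  }
}
```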
*/ diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java index 58d63a194e81d..e569baefb6f06 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java @@ -21,26 +21,42 @@ import org.apache.hudi.avro.model.HoodieCleanMetadata; import org.apache.hudi.avro.model.HoodieRestoreMetadata; import org.apache.hudi.avro.model.HoodieRollbackMetadata; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.data.HoodieData; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieColumnRangeMetadata; import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieDeltaWriteStat; +import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.TableSchemaResolver; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ParquetUtils; import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieMetadataException; +import org.apache.hudi.io.storage.HoodieFileReader; +import org.apache.hudi.io.storage.HoodieFileReaderFactory; +import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import java.io.IOException; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -62,12 +78,17 @@ public class HoodieTableMetadataUtil { private static final Logger LOG = LogManager.getLogger(HoodieTableMetadataUtil.class); + protected static final String PARTITION_NAME_FILES = "files"; + protected static final String PARTITION_NAME_COLUMN_STATS = "column_stats"; + protected static final String PARTITION_NAME_BLOOM_FILTERS = "bloom_filters"; + /** - * Delete the metadata table for the dataset. This will be invoked during upgrade/downgrade operation during which no other + * Delete the metadata table for the dataset. This will be invoked during upgrade/downgrade operation during which + * no other * process should be running. * * @param basePath base path of the dataset - * @param context instance of {@link HoodieEngineContext}. + * @param context instance of {@link HoodieEngineContext}. 
*/ public static void deleteMetadataTable(String basePath, HoodieEngineContext context) { final String metadataTablePath = HoodieTableMetadata.getMetadataTableBasePath(basePath); @@ -79,14 +100,53 @@ public static void deleteMetadataTable(String basePath, HoodieEngineContext cont } } + /** + * Convert commit action to metadata records for the enabled partition types. + * + * @param commitMetadata - Commit action metadata + * @param dataMetaClient - Meta client for the data table + * @param isMetaIndexColumnStatsForAllColumns - Do all columns need meta indexing? + * @param instantTime - Action instant time + * @return Map of partition to metadata records for the commit action + */ + public static Map> convertMetadataToRecords( + HoodieEngineContext context, List enabledPartitionTypes, + HoodieCommitMetadata commitMetadata, HoodieTableMetaClient dataMetaClient, + boolean isMetaIndexColumnStatsForAllColumns, String instantTime) { + final Map> partitionToRecordsMap = new HashMap<>(); + final HoodieData filesPartitionRecordsRDD = context.parallelize( + convertMetadataToFilesPartitionRecords(commitMetadata, instantTime), 1); + partitionToRecordsMap.put(MetadataPartitionType.FILES, filesPartitionRecordsRDD); + + if (enabledPartitionTypes.contains(MetadataPartitionType.BLOOM_FILTERS)) { + final List metadataBloomFilterRecords = convertMetadataToBloomFilterRecords(commitMetadata, + dataMetaClient, instantTime); + if (!metadataBloomFilterRecords.isEmpty()) { + final HoodieData metadataBloomFilterRecordsRDD = context.parallelize(metadataBloomFilterRecords, 1); + partitionToRecordsMap.put(MetadataPartitionType.BLOOM_FILTERS, metadataBloomFilterRecordsRDD); + } + } + + if (enabledPartitionTypes.contains(MetadataPartitionType.COLUMN_STATS)) { + final List metadataColumnStats = convertMetadataToColumnStatsRecords(commitMetadata, context, + dataMetaClient, isMetaIndexColumnStatsForAllColumns, instantTime); + if (!metadataColumnStats.isEmpty()) { + final HoodieData metadataColumnStatsRDD = context.parallelize(metadataColumnStats, 1); + partitionToRecordsMap.put(MetadataPartitionType.COLUMN_STATS, metadataColumnStatsRDD); + } + } + return partitionToRecordsMap; + } + /** * Finds all new files/partitions created as part of commit and creates metadata table records for them. * - * @param commitMetadata - * @param instantTime - * @return a list of metadata table records + * @param commitMetadata - Commit action metadata + * @param instantTime - Commit action instant time + * @return List of metadata table records */ - public static List convertMetadataToRecords(HoodieCommitMetadata commitMetadata, String instantTime) { + public static List convertMetadataToFilesPartitionRecords(HoodieCommitMetadata commitMetadata, + String instantTime) { List records = new LinkedList<>(); List allPartitions = new LinkedList<>(); commitMetadata.getPartitionToWriteStats().forEach((partitionStatName, writeStats) -> { @@ -124,6 +184,102 @@ public static List convertMetadataToRecords(HoodieCommitMetadata c return records; } + /** + * Convert commit action metadata to bloom filter records. 
+ * + * @param commitMetadata - Commit action metadata + * @param dataMetaClient - Meta client for the data table + * @param instantTime - Action instant time + * @return List of metadata table records + */ + public static List convertMetadataToBloomFilterRecords(HoodieCommitMetadata commitMetadata, + HoodieTableMetaClient dataMetaClient, + String instantTime) { + List records = new LinkedList<>(); + commitMetadata.getPartitionToWriteStats().forEach((partitionStatName, writeStats) -> { + final String partition = partitionStatName.equals(EMPTY_PARTITION_NAME) ? NON_PARTITIONED_NAME : partitionStatName; + Map newFiles = new HashMap<>(writeStats.size()); + writeStats.forEach(hoodieWriteStat -> { + // No action for delta logs + if (hoodieWriteStat instanceof HoodieDeltaWriteStat) { + return; + } + + String pathWithPartition = hoodieWriteStat.getPath(); + if (pathWithPartition == null) { + // Empty partition + LOG.error("Failed to find path in write stat to update metadata table " + hoodieWriteStat); + return; + } + int offset = partition.equals(NON_PARTITIONED_NAME) ? (pathWithPartition.startsWith("/") ? 1 : 0) : + partition.length() + 1; + + final String fileName = pathWithPartition.substring(offset); + if (!FSUtils.isBaseFile(new Path(fileName))) { + return; + } + ValidationUtils.checkState(!newFiles.containsKey(fileName), "Duplicate files in HoodieCommitMetadata"); + + final Path writeFilePath = new Path(dataMetaClient.getBasePath(), pathWithPartition); + try { + HoodieFileReader fileReader = + HoodieFileReaderFactory.getFileReader(dataMetaClient.getHadoopConf(), writeFilePath); + try { + final BloomFilter fileBloomFilter = fileReader.readBloomFilter(); + if (fileBloomFilter == null) { + LOG.error("Failed to read bloom filter for " + writeFilePath); + return; + } + ByteBuffer bloomByteBuffer = ByteBuffer.wrap(fileBloomFilter.serializeToString().getBytes()); + HoodieRecord record = HoodieMetadataPayload.createBloomFilterMetadataRecord( + partition, fileName, instantTime, bloomByteBuffer, false); + records.add(record); + } catch (Exception e) { + LOG.error("Failed to read bloom filter for " + writeFilePath); + return; + } + fileReader.close(); + } catch (IOException e) { + LOG.error("Failed to get bloom filter for file: " + writeFilePath + ", write stat: " + hoodieWriteStat); + } + }); + }); + + return records; + } + + /** + * Convert the clean action to metadata records. 
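One hazard in the bloom filter extraction above: `fileReader.close()` is only reached on the success path, so a failed `readBloomFilter()` leaves the reader open. A leak-free sketch of the same read, reusing the locals of the surrounding loop and assuming only `close()` is available on the reader (the enclosing `catch (IOException e)` still applies):

```java
// Sketch: same bloom filter read as above, but the reader is closed on every path.
HoodieFileReader fileReader =
    HoodieFileReaderFactory.getFileReader(dataMetaClient.getHadoopConf(), writeFilePath);
try {
  final BloomFilter fileBloomFilter = fileReader.readBloomFilter();
  if (fileBloomFilter == null) {
    LOG.error("Failed to read bloom filter for " + writeFilePath);
  } else {
    ByteBuffer bloomByteBuffer = ByteBuffer.wrap(fileBloomFilter.serializeToString().getBytes());
    records.add(HoodieMetadataPayload.createBloomFilterMetadataRecord(
        partition, fileName, instantTime, bloomByteBuffer, false));
  }
} finally {
  fileReader.close(); // closed on success and failure alike
}
```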
+ */ + public static Map> convertMetadataToRecords( + HoodieEngineContext engineContext, List enabledPartitionTypes, + HoodieCleanMetadata cleanMetadata, HoodieTableMetaClient dataMetaClient, String instantTime) { + final Map> partitionToRecordsMap = new HashMap<>(); + final HoodieData filesPartitionRecordsRDD = engineContext.parallelize( + convertMetadataToFilesPartitionRecords(cleanMetadata, instantTime), 1); + partitionToRecordsMap.put(MetadataPartitionType.FILES, filesPartitionRecordsRDD); + + if (enabledPartitionTypes.contains(MetadataPartitionType.BLOOM_FILTERS)) { + final List metadataBloomFilterRecords = convertMetadataToBloomFilterRecords(cleanMetadata, + engineContext, instantTime); + if (!metadataBloomFilterRecords.isEmpty()) { + final HoodieData metadataBloomFilterRecordsRDD = engineContext.parallelize(metadataBloomFilterRecords, 1); + partitionToRecordsMap.put(MetadataPartitionType.BLOOM_FILTERS, metadataBloomFilterRecordsRDD); + } + } + + if (enabledPartitionTypes.contains(MetadataPartitionType.COLUMN_STATS)) { + final List metadataColumnStats = convertMetadataToColumnStatsRecords(cleanMetadata, engineContext, + dataMetaClient); + if (!metadataColumnStats.isEmpty()) { + final HoodieData metadataColumnStatsRDD = engineContext.parallelize(metadataColumnStats, 1); + partitionToRecordsMap.put(MetadataPartitionType.COLUMN_STATS, metadataColumnStatsRDD); + } + } + + return partitionToRecordsMap; + } + /** * Finds all files that were deleted as part of a clean and creates metadata table records for them. * @@ -131,7 +287,8 @@ public static List convertMetadataToRecords(HoodieCommitMetadata c * @param instantTime * @return a list of metadata table records */ - public static List convertMetadataToRecords(HoodieCleanMetadata cleanMetadata, String instantTime) { + public static List convertMetadataToFilesPartitionRecords(HoodieCleanMetadata cleanMetadata, + String instantTime) { List records = new LinkedList<>(); int[] fileDeleteCount = {0}; cleanMetadata.getPartitionMetadata().forEach((partitionName, partitionMetadata) -> { @@ -150,51 +307,191 @@ public static List convertMetadataToRecords(HoodieCleanMetadata cl return records; } + /** + * Convert clean metadata to bloom filter index records. + * + * @param cleanMetadata - Clean action metadata + * @param engineContext - Engine context + * @param instantTime - Clean action instant time + * @return List of bloom filter index records for the clean metadata + */ + public static List convertMetadataToBloomFilterRecords(HoodieCleanMetadata cleanMetadata, + HoodieEngineContext engineContext, + String instantTime) { + List> deleteFileList = new ArrayList<>(); + cleanMetadata.getPartitionMetadata().forEach((partition, partitionMetadata) -> { + // Files deleted from a partition + List deletedFiles = partitionMetadata.getDeletePathPatterns(); + deletedFiles.forEach(entry -> { + final Path deletedFilePath = new Path(entry); + if (FSUtils.isBaseFile(deletedFilePath)) { + deleteFileList.add(Pair.of(partition, deletedFilePath.getName())); + } + }); + }); + + return engineContext.map(deleteFileList, deleteFileInfo -> { + return HoodieMetadataPayload.createBloomFilterMetadataRecord( + deleteFileInfo.getLeft(), deleteFileInfo.getRight(), instantTime, ByteBuffer.allocate(0), true); + }, 1).stream().collect(Collectors.toList()); + } + + /** + * Convert clean metadata to column stats index records. 
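Deletions are modeled as tombstones: for a base file removed by clean, the converter above emits a bloom filter record carrying an empty buffer and `isDeleted = true`. A distilled sketch of that convention, with illustrative values:

```java
// Tombstone for a base file removed by clean: empty payload, isDeleted = true.
// All literal values below are illustrative only.
HoodieRecord tombstone = HoodieMetadataPayload.createBloomFilterMetadataRecord(
    "2021/02/01",                     // partition the file lived in
    "abc123_1-0-1_20220101.parquet",  // deleted base file name
    "20220102093000",                 // clean action instant time
    ByteBuffer.allocate(0),           // no bloom filter content for a delete
    true);                            // marks the entry as deleted
```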
+ * + * @param cleanMetadata - Clean action metadata + * @param engineContext - Engine context + * @param datasetMetaClient - data table meta client + * @return List of column stats index records for the clean metadata + */ + public static List convertMetadataToColumnStatsRecords(HoodieCleanMetadata cleanMetadata, + HoodieEngineContext engineContext, + HoodieTableMetaClient datasetMetaClient) { + List> deleteFileList = new ArrayList<>(); + cleanMetadata.getPartitionMetadata().forEach((partition, partitionMetadata) -> { + // Files deleted from a partition + List deletedFiles = partitionMetadata.getDeletePathPatterns(); + deletedFiles.forEach(entry -> deleteFileList.add(Pair.of(partition, entry))); + }); + + List latestColumns = getLatestColumns(datasetMetaClient); + return engineContext.flatMap(deleteFileList, + deleteFileInfo -> { + if (deleteFileInfo.getRight().endsWith(HoodieFileFormat.PARQUET.getFileExtension())) { + return getColumnStats(deleteFileInfo.getKey(), deleteFileInfo.getValue(), datasetMetaClient, + latestColumns, true); + } + return Stream.empty(); + }, 1).stream().collect(Collectors.toList()); + } + + /** + * Convert restore action metadata to metadata table records. + */ + public static Map> convertMetadataToRecords( + HoodieEngineContext engineContext, List enabledPartitionTypes, + HoodieActiveTimeline metadataTableTimeline, HoodieRestoreMetadata restoreMetadata, + HoodieTableMetaClient dataMetaClient, String instantTime, Option lastSyncTs) { + final Map> partitionToRecordsMap = new HashMap<>(); + final Map> partitionToAppendedFiles = new HashMap<>(); + final Map> partitionToDeletedFiles = new HashMap<>(); + + processRestoreMetadata(metadataTableTimeline, restoreMetadata, + partitionToAppendedFiles, partitionToDeletedFiles, lastSyncTs); + + final HoodieData filesPartitionRecordsRDD = engineContext.parallelize( + convertFilesToFilesPartitionRecords(partitionToDeletedFiles, + partitionToAppendedFiles, instantTime, "Restore"), 1); + partitionToRecordsMap.put(MetadataPartitionType.FILES, filesPartitionRecordsRDD); + + if (enabledPartitionTypes.contains(MetadataPartitionType.BLOOM_FILTERS)) { + final List metadataBloomFilterRecords = convertFilesToBloomFilterRecords( + engineContext, dataMetaClient, partitionToDeletedFiles, partitionToAppendedFiles, instantTime); + if (!metadataBloomFilterRecords.isEmpty()) { + final HoodieData metadataBloomFilterRecordsRDD = engineContext.parallelize(metadataBloomFilterRecords, 1); + partitionToRecordsMap.put(MetadataPartitionType.BLOOM_FILTERS, metadataBloomFilterRecordsRDD); + } + } + + if (enabledPartitionTypes.contains(MetadataPartitionType.COLUMN_STATS)) { + final List metadataColumnStats = convertFilesToColumnStatsRecords( + engineContext, dataMetaClient, partitionToDeletedFiles, partitionToAppendedFiles, instantTime); + if (!metadataColumnStats.isEmpty()) { + final HoodieData metadataColumnStatsRDD = engineContext.parallelize(metadataColumnStats, 1); + partitionToRecordsMap.put(MetadataPartitionType.COLUMN_STATS, metadataColumnStatsRDD); + } + } + + return partitionToRecordsMap; + } + /** * Aggregates all files deleted and appended to from all rollbacks associated with a restore operation then * creates metadata table records for them. 
* - * @param restoreMetadata - * @param instantTime + * @param restoreMetadata - Restore action metadata * @return a list of metadata table records */ - public static List convertMetadataToRecords(HoodieActiveTimeline metadataTableTimeline, - HoodieRestoreMetadata restoreMetadata, String instantTime, Option lastSyncTs) { - Map> partitionToAppendedFiles = new HashMap<>(); - Map> partitionToDeletedFiles = new HashMap<>(); + private static void processRestoreMetadata(HoodieActiveTimeline metadataTableTimeline, + HoodieRestoreMetadata restoreMetadata, + Map> partitionToAppendedFiles, + Map> partitionToDeletedFiles, + Option lastSyncTs) { restoreMetadata.getHoodieRestoreMetadata().values().forEach(rms -> { - rms.forEach(rm -> processRollbackMetadata(metadataTableTimeline, rm, partitionToDeletedFiles, partitionToAppendedFiles, lastSyncTs)); + rms.forEach(rm -> processRollbackMetadata(metadataTableTimeline, rm, + partitionToDeletedFiles, partitionToAppendedFiles, lastSyncTs)); }); - - return convertFilesToRecords(partitionToDeletedFiles, partitionToAppendedFiles, instantTime, "Restore"); } - public static List convertMetadataToRecords(HoodieActiveTimeline metadataTableTimeline, - HoodieRollbackMetadata rollbackMetadata, String instantTime, - Option lastSyncTs, boolean wasSynced) { + /** + * Convert rollback action metadata to metadata table records. + */ + public static Map> convertMetadataToRecords( + HoodieEngineContext engineContext, List enabledPartitionTypes, + HoodieActiveTimeline metadataTableTimeline, HoodieRollbackMetadata rollbackMetadata, + HoodieTableMetaClient dataMetaClient, String instantTime, Option lastSyncTs, boolean wasSynced) { + final Map> partitionToRecordsMap = new HashMap<>(); - Map> partitionToAppendedFiles = new HashMap<>(); Map> partitionToDeletedFiles = new HashMap<>(); - processRollbackMetadata(metadataTableTimeline, rollbackMetadata, partitionToDeletedFiles, partitionToAppendedFiles, lastSyncTs); + Map> partitionToAppendedFiles = new HashMap<>(); + List filesPartitionRecords = convertMetadataToRollbackRecords(metadataTableTimeline, rollbackMetadata, + partitionToDeletedFiles, partitionToAppendedFiles, instantTime, lastSyncTs, wasSynced); + final HoodieData rollbackRecordsRDD = engineContext.parallelize(filesPartitionRecords, 1); + partitionToRecordsMap.put(MetadataPartitionType.FILES, rollbackRecordsRDD); + + if (enabledPartitionTypes.contains(MetadataPartitionType.BLOOM_FILTERS)) { + final List metadataBloomFilterRecords = convertFilesToBloomFilterRecords( + engineContext, dataMetaClient, partitionToDeletedFiles, partitionToAppendedFiles, instantTime); + if (!metadataBloomFilterRecords.isEmpty()) { + final HoodieData metadataBloomFilterRecordsRDD = engineContext.parallelize(metadataBloomFilterRecords, 1); + partitionToRecordsMap.put(MetadataPartitionType.BLOOM_FILTERS, metadataBloomFilterRecordsRDD); + } + } + + if (enabledPartitionTypes.contains(MetadataPartitionType.COLUMN_STATS)) { + final List metadataColumnStats = convertFilesToColumnStatsRecords( + engineContext, dataMetaClient, partitionToDeletedFiles, partitionToAppendedFiles, instantTime); + if (!metadataColumnStats.isEmpty()) { + final HoodieData metadataColumnStatsRDD = engineContext.parallelize(metadataColumnStats, 1); + partitionToRecordsMap.put(MetadataPartitionType.COLUMN_STATS, metadataColumnStatsRDD); + } + } + + return partitionToRecordsMap; + } + + /** + * Convert rollback action metadata to files partition records. 
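The commit, clean, restore, and rollback converters all repeat the same per-partition steps: build the record list, skip it if empty, then parallelize it into a `HoodieData` with parallelism 1. A hypothetical helper that would consolidate the repetition (not part of this patch, shown only to make the shared pattern explicit):

```java
// Hypothetical consolidation of the repeated "skip if empty, then parallelize" step.
private static void putIfNonEmpty(HoodieEngineContext engineContext,
                                  Map<MetadataPartitionType, HoodieData<HoodieRecord>> target,
                                  MetadataPartitionType partitionType,
                                  List<HoodieRecord> records) {
  if (!records.isEmpty()) {
    // same parallelism of 1 that the converters above use
    target.put(partitionType, engineContext.parallelize(records, 1));
  }
}
```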
+ */ + private static List convertMetadataToRollbackRecords(HoodieActiveTimeline metadataTableTimeline, + HoodieRollbackMetadata rollbackMetadata, + Map> partitionToDeletedFiles, + Map> partitionToAppendedFiles, + String instantTime, + Option lastSyncTs, boolean wasSynced) { + processRollbackMetadata(metadataTableTimeline, rollbackMetadata, partitionToDeletedFiles, + partitionToAppendedFiles, lastSyncTs); if (!wasSynced) { // Since the instant-being-rolled-back was never committed to the metadata table, the files added there // need not be deleted. For MOR Table, the rollback appends logBlocks so we need to keep the appended files. partitionToDeletedFiles.clear(); } - return convertFilesToRecords(partitionToDeletedFiles, partitionToAppendedFiles, instantTime, "Rollback"); + return convertFilesToFilesPartitionRecords(partitionToDeletedFiles, partitionToAppendedFiles, instantTime, "Rollback"); } /** * Extracts information about the deleted and append files from the {@code HoodieRollbackMetadata}. - * + *

* During a rollback files may be deleted (COW, MOR) or rollback blocks be appended (MOR only) to files. This * function will extract this change file for each partition. - * @param metadataTableTimeline Current timeline of the Metdata Table - * @param rollbackMetadata {@code HoodieRollbackMetadata} - * @param partitionToDeletedFiles The {@code Map} to fill with files deleted per partition. + * + * @param metadataTableTimeline Current timeline of the Metadata Table + * @param rollbackMetadata {@code HoodieRollbackMetadata} + * @param partitionToDeletedFiles The {@code Map} to fill with files deleted per partition. * @param partitionToAppendedFiles The {@code Map} to fill with files appended per partition and their sizes. */ - private static void processRollbackMetadata(HoodieActiveTimeline metadataTableTimeline, HoodieRollbackMetadata rollbackMetadata, + private static void processRollbackMetadata(HoodieActiveTimeline metadataTableTimeline, + HoodieRollbackMetadata rollbackMetadata, Map> partitionToDeletedFiles, Map> partitionToAppendedFiles, Option lastSyncTs) { @@ -264,23 +561,15 @@ private static void processRollbackMetadata(HoodieActiveTimeline metadataTableTi partitionToAppendedFiles.get(partition).merge(new Path(path).getName(), size, fileMergeFn); }); } - - if (pm.getWrittenLogFiles() != null && !pm.getWrittenLogFiles().isEmpty()) { - if (!partitionToAppendedFiles.containsKey(partition)) { - partitionToAppendedFiles.put(partition, new HashMap<>()); - } - - // Extract appended file name from the absolute paths saved in getWrittenLogFiles() - pm.getWrittenLogFiles().forEach((path, size) -> { - partitionToAppendedFiles.get(partition).merge(new Path(path).getName(), size, fileMergeFn); - }); - } }); } - private static List convertFilesToRecords(Map> partitionToDeletedFiles, - Map> partitionToAppendedFiles, String instantTime, - String operation) { + /** + * Convert rollback action metadata to files partition records. + */ + private static List convertFilesToFilesPartitionRecords(Map> partitionToDeletedFiles, + Map> partitionToAppendedFiles, + String instantTime, String operation) { List records = new LinkedList<>(); int[] fileChangeCount = {0, 0}; // deletes, appends @@ -319,9 +608,88 @@ private static List convertFilesToRecords(Map return records; } + /** + * Convert rollback action metadata to bloom filter index records. + */ + private static List convertFilesToBloomFilterRecords(HoodieEngineContext engineContext, + HoodieTableMetaClient dataMetaClient, + Map> partitionToDeletedFiles, + Map> partitionToAppendedFiles, + String instantTime) { + List records = new LinkedList<>(); + partitionToDeletedFiles.forEach((partitionName, deletedFileList) -> deletedFileList.forEach(deletedFile -> { + if (!FSUtils.isBaseFile(new Path(deletedFile))) { + return; + } + + final String partition = partitionName.equals(EMPTY_PARTITION_NAME) ? NON_PARTITIONED_NAME : partitionName; + records.add(HoodieMetadataPayload.createBloomFilterMetadataRecord( + partition, deletedFile, instantTime, ByteBuffer.allocate(0), true)); + })); + + partitionToAppendedFiles.forEach((partitionName, appendedFileMap) -> { + final String partition = partitionName.equals(EMPTY_PARTITION_NAME) ? 
NON_PARTITIONED_NAME : partitionName; + appendedFileMap.forEach((appendedFile, length) -> { + if (!FSUtils.isBaseFile(new Path(appendedFile))) { + return; + } + final String pathWithPartition = partitionName + "/" + appendedFile; + final Path appendedFilePath = new Path(dataMetaClient.getBasePath(), pathWithPartition); + try { + HoodieFileReader fileReader = + HoodieFileReaderFactory.getFileReader(dataMetaClient.getHadoopConf(), appendedFilePath); + final BloomFilter fileBloomFilter = fileReader.readBloomFilter(); + if (fileBloomFilter == null) { + LOG.error("Failed to read bloom filter for " + appendedFilePath); + return; + } + ByteBuffer bloomByteBuffer = ByteBuffer.wrap(fileBloomFilter.serializeToString().getBytes()); + HoodieRecord record = HoodieMetadataPayload.createBloomFilterMetadataRecord( + partition, appendedFile, instantTime, bloomByteBuffer, false); + records.add(record); + fileReader.close(); + } catch (IOException e) { + LOG.error("Failed to get bloom filter for file: " + appendedFilePath); + } + }); + }); + return records; + } + + /** + * Convert rollback action metadata to column stats index records. + */ + private static List convertFilesToColumnStatsRecords(HoodieEngineContext engineContext, + HoodieTableMetaClient datasetMetaClient, + Map> partitionToDeletedFiles, + Map> partitionToAppendedFiles, + String instantTime) { + List records = new LinkedList<>(); + List latestColumns = getLatestColumns(datasetMetaClient); + partitionToDeletedFiles.forEach((partitionName, deletedFileList) -> deletedFileList.forEach(deletedFile -> { + final String partition = partitionName.equals(EMPTY_PARTITION_NAME) ? NON_PARTITIONED_NAME : partitionName; + if (deletedFile.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) { + final String filePathWithPartition = partitionName + "/" + deletedFile; + records.addAll(getColumnStats(partition, filePathWithPartition, datasetMetaClient, + latestColumns, true).collect(Collectors.toList())); + } + })); + + partitionToAppendedFiles.forEach((partitionName, appendedFileMap) -> appendedFileMap.forEach( + (appendedFile, size) -> { + final String partition = partitionName.equals(EMPTY_PARTITION_NAME) ? NON_PARTITIONED_NAME : partitionName; + if (appendedFile.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) { + final String filePathWithPartition = partitionName + "/" + appendedFile; + records.addAll(getColumnStats(partition, filePathWithPartition, datasetMetaClient, + latestColumns, false).collect(Collectors.toList())); + } + })); + return records; + } + /** * Map a record key to a file group in partition of interest. - * + *

* Note: For hashing, the algorithm is same as String.hashCode() but is being defined here as hashCode() * implementation is not guaranteed by the JVM to be consistent across JVM versions and implementations. * @@ -349,7 +717,7 @@ public static int mapRecordKeyToFileGroupIndex(String recordKey, int numFileGrou */ public static List getPartitionLatestMergedFileSlices(HoodieTableMetaClient metaClient, String partition) { LOG.info("Loading latest merged file slices for metadata table partition " + partition); - return getPartitionFileSlices(metaClient, partition, true); + return getPartitionFileSlices(metaClient, Option.empty(), partition, true); } /** @@ -357,27 +725,23 @@ public static List getPartitionLatestMergedFileSlices(HoodieTableMeta * returned is sorted in the correct order of file group name. * * @param metaClient - Instance of {@link HoodieTableMetaClient}. + * @param fsView - Metadata table filesystem view * @param partition - The name of the partition whose file groups are to be loaded. * @return List of latest file slices for all file groups in a given partition. */ - public static List getPartitionLatestFileSlices(HoodieTableMetaClient metaClient, String partition) { + public static List getPartitionLatestFileSlices(HoodieTableMetaClient metaClient, + Option fsView, String partition) { LOG.info("Loading latest file slices for metadata table partition " + partition); - return getPartitionFileSlices(metaClient, partition, false); + return getPartitionFileSlices(metaClient, fsView, partition, false); } /** - * Get the latest file slices for a given partition. + * Get metadata table file system view. * - * @param metaClient - Instance of {@link HoodieTableMetaClient}. - * @param partition - The name of the partition whose file groups are to be loaded. - * @param mergeFileSlices - When enabled, will merge the latest file slices with the last known - * completed instant. This is useful for readers when there are pending - * compactions. MergeFileSlices when disabled, will return the latest file - * slices without any merging, and this is needed for the writers. - * @return List of latest file slices for all file groups in a given partition. + * @param metaClient - Metadata table meta client + * @return Filesystem view for the metadata table */ - private static List getPartitionFileSlices(HoodieTableMetaClient metaClient, String partition, - boolean mergeFileSlices) { + public static HoodieTableFileSystemView getFileSystemView(HoodieTableMetaClient metaClient) { // If there are no commits on the metadata table then the table's // default FileSystemView will not return any file slices even // though we may have initialized them. @@ -387,16 +751,175 @@ private static List getPartitionFileSlices(HoodieTableMetaClient meta HoodieActiveTimeline.createNewInstantTime()); timeline = new HoodieDefaultTimeline(Arrays.asList(instant).stream(), metaClient.getActiveTimeline()::getInstantDetails); } + return new HoodieTableFileSystemView(metaClient, timeline); + } - HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, timeline); + /** + * Get the latest file slices for a given partition. + * + * @param metaClient - Instance of {@link HoodieTableMetaClient}. + * @param partition - The name of the partition whose file groups are to be loaded. + * @param mergeFileSlices - When enabled, will merge the latest file slices with the last known + * completed instant. This is useful for readers when there are pending + * compactions. 
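Given the description above, the key-to-file-group mapping plausibly amounts to the following sketch: a fixed `String.hashCode()`-style recurrence, folded into the file group count. This is an illustration of the stated semantics, not a verbatim copy of the method body:

```java
// Sketch of mapping a record key to one of N metadata file groups, using a
// pinned String.hashCode()-style hash as the javadoc above describes.
static int mapRecordKeyToFileGroupIndex(String recordKey, int numFileGroups) {
  int hash = 0;
  for (int i = 0; i < recordKey.length(); i++) {
    hash = 31 * hash + recordKey.charAt(i); // same recurrence as String.hashCode()
  }
  return Math.abs(Math.abs(hash) % numFileGroups); // fold into [0, numFileGroups)
}
```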
MergeFileSlices when disabled, will return the latest file + * slices without any merging, and this is needed for the writers. + * @return List of latest file slices for all file groups in a given partition. + */ + private static List getPartitionFileSlices(HoodieTableMetaClient metaClient, + Option fileSystemView, + String partition, + boolean mergeFileSlices) { + HoodieTableFileSystemView fsView = fileSystemView.orElse(getFileSystemView(metaClient)); Stream fileSliceStream; if (mergeFileSlices) { fileSliceStream = fsView.getLatestMergedFileSlicesBeforeOrOn( - partition, timeline.filterCompletedInstants().lastInstant().get().getTimestamp()); + partition, metaClient.getActiveTimeline().filterCompletedInstants().lastInstant().get().getTimestamp()); } else { fileSliceStream = fsView.getLatestFileSlices(partition); } return fileSliceStream.sorted((s1, s2) -> s1.getFileId().compareTo(s2.getFileId())).collect(Collectors.toList()); } + public static List convertMetadataToColumnStatsRecords(HoodieCommitMetadata commitMetadata, + HoodieEngineContext engineContext, + HoodieTableMetaClient dataMetaClient, + boolean isMetaIndexColumnStatsForAllColumns, + String instantTime) { + + try { + List allWriteStats = commitMetadata.getPartitionToWriteStats().values().stream() + .flatMap(entry -> entry.stream()).collect(Collectors.toList()); + return HoodieTableMetadataUtil.createColumnStatsFromWriteStats(engineContext, dataMetaClient, allWriteStats, + isMetaIndexColumnStatsForAllColumns); + } catch (Exception e) { + throw new HoodieException("Failed to generate column stats records for metadata table ", e); + } + } + + /** + * Create column stats from write status. + * + * @param engineContext - Engine context + * @param datasetMetaClient - Dataset meta client + * @param allWriteStats - Write status to convert + * @param isMetaIndexColumnStatsForAllColumns - Are all columns enabled for indexing + */ + public static List createColumnStatsFromWriteStats(HoodieEngineContext engineContext, + HoodieTableMetaClient datasetMetaClient, + List allWriteStats, + boolean isMetaIndexColumnStatsForAllColumns) throws Exception { + if (allWriteStats.isEmpty()) { + return Collections.emptyList(); + } + + List prunedWriteStats = allWriteStats.stream().filter(writeStat -> { + return !(writeStat instanceof HoodieDeltaWriteStat); + }).collect(Collectors.toList()); + if (prunedWriteStats.isEmpty()) { + return Collections.emptyList(); + } + + return engineContext.flatMap(prunedWriteStats, + writeStat -> translateWriteStatToColumnStats(writeStat, datasetMetaClient, + getLatestColumns(datasetMetaClient, isMetaIndexColumnStatsForAllColumns)), + prunedWriteStats.size()); + } + + /** + * Get the latest columns for the table for column stats indexing. + * + * @param datasetMetaClient - Data table meta client + * @param isMetaIndexColumnStatsForAllColumns - Is column stats indexing enabled for all columns + */ + private static List getLatestColumns(HoodieTableMetaClient datasetMetaClient, boolean isMetaIndexColumnStatsForAllColumns) { + if (!isMetaIndexColumnStatsForAllColumns + || datasetMetaClient.getCommitsTimeline().filterCompletedInstants().countInstants() < 1) { + return Collections.singletonList(datasetMetaClient.getTableConfig().getRecordKeyFieldProp()); + } + + TableSchemaResolver schemaResolver = new TableSchemaResolver(datasetMetaClient); + // consider nested fields as well. 
if column stats is enabled only for a subset of columns, + // directly use them instead of all columns from the latest table schema + try { + return schemaResolver.getTableAvroSchema().getFields().stream() + .map(entry -> entry.name()).collect(Collectors.toList()); + } catch (Exception e) { + throw new HoodieException("Failed to get latest columns for " + datasetMetaClient.getBasePath()); + } + } + + private static List getLatestColumns(HoodieTableMetaClient datasetMetaClient) { + return getLatestColumns(datasetMetaClient, false); + } + + public static Stream translateWriteStatToColumnStats(HoodieWriteStat writeStat, + HoodieTableMetaClient datasetMetaClient, + List latestColumns) { + return getColumnStats(writeStat.getPartitionPath(), writeStat.getPath(), datasetMetaClient, latestColumns, false); + + } + + private static Stream getColumnStats(final String partitionPath, final String filePathWithPartition, + HoodieTableMetaClient datasetMetaClient, + List columns, boolean isDeleted) { + final String partition = partitionPath.equals(EMPTY_PARTITION_NAME) ? NON_PARTITIONED_NAME : partitionPath; + final int offset = partition.equals(NON_PARTITIONED_NAME) ? (filePathWithPartition.startsWith("/") ? 1 : 0) + : partition.length() + 1; + final String fileName = filePathWithPartition.substring(offset); + if (!FSUtils.isBaseFile(new Path(fileName))) { + return Stream.empty(); + } + + if (filePathWithPartition.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) { + List> columnRangeMetadataList = new ArrayList<>(); + final Path fullFilePath = new Path(datasetMetaClient.getBasePath(), filePathWithPartition); + if (!isDeleted) { + try { + columnRangeMetadataList = new ParquetUtils().readRangeFromParquetMetadata( + datasetMetaClient.getHadoopConf(), fullFilePath, columns); + } catch (Exception e) { + LOG.error("Failed to read column stats for " + fullFilePath, e); + } + } else { + columnRangeMetadataList = + columns.stream().map(entry -> new HoodieColumnRangeMetadata(fileName, + entry, null, null, 0, 0, 0, 0)) + .collect(Collectors.toList()); + } + return HoodieMetadataPayload.createColumnStatsRecords(partitionPath, columnRangeMetadataList, isDeleted); + } else { + throw new HoodieException("Column range index not supported for filePathWithPartition " + fileName); + } + } + + /** + * Get file group count for a metadata table partition. 
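Each metadata partition is statically sized into a number of file groups, and combined with the per-partition `fileIdPrefix` introduced below, file group IDs can be derived deterministically. A sketch of the naming this implies; the zero-padded width is an assumption not spelled out in this hunk, and `metaClientOpt`, `fsViewOpt`, `metadataConfig`, and `isBootstrapCompleted` stand in for caller-supplied values:

```java
// Assumed naming: fileIdPrefix + zero-padded index, e.g. "bloom-filters-0000".
int count = HoodieTableMetadataUtil.getPartitionFileGroupCount(
    MetadataPartitionType.BLOOM_FILTERS, metaClientOpt, fsViewOpt,
    metadataConfig, isBootstrapCompleted);
for (int i = 0; i < count; i++) {
  String fileGroupId =
      String.format("%s%04d", MetadataPartitionType.BLOOM_FILTERS.getFileIdPrefix(), i);
  // record keys hash into one of these file groups (see mapRecordKeyToFileGroupIndex)
}
```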
+ * + * @param partitionType - Metadata table partition type + * @param metaClient - Metadata table meta client + * @param fsView - Filesystem view + * @param metadataConfig - Metadata config + * @param isBootstrapCompleted - Is bootstrap completed for the metadata table + * @return File group count for the requested metadata partition type + */ + public static int getPartitionFileGroupCount(final MetadataPartitionType partitionType, + final Option metaClient, + final Option fsView, + final HoodieMetadataConfig metadataConfig, boolean isBootstrapCompleted) { + if (isBootstrapCompleted) { + final List latestFileSlices = HoodieTableMetadataUtil + .getPartitionLatestFileSlices(metaClient.get(), fsView, partitionType.getPartitionPath()); + return Math.max(latestFileSlices.size(), 1); + } + + switch (partitionType) { + case BLOOM_FILTERS: + return metadataConfig.getBloomFilterIndexFileGroupCount(); + case COLUMN_STATS: + return metadataConfig.getColumnStatsIndexFileGroupCount(); + default: + return 1; + } + } + } diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/MetadataPartitionType.java b/hudi-common/src/main/java/org/apache/hudi/metadata/MetadataPartitionType.java index 380f4d04d34a6..9fb268e7de1b0 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/MetadataPartitionType.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/MetadataPartitionType.java @@ -22,19 +22,23 @@ import java.util.List; public enum MetadataPartitionType { - FILES("files", "files-"); + FILES(HoodieTableMetadataUtil.PARTITION_NAME_FILES, "files-"), + COLUMN_STATS(HoodieTableMetadataUtil.PARTITION_NAME_COLUMN_STATS, "col-stats-"), + BLOOM_FILTERS(HoodieTableMetadataUtil.PARTITION_NAME_BLOOM_FILTERS, "bloom-filters-"); - // refers to partition path in metadata table. + // Partition path in metadata table. private final String partitionPath; - // refers to fileId prefix used for all file groups in this partition. + // FileId prefix used for all file groups in this partition. 
private final String fileIdPrefix; + // Total file groups + private int fileGroupCount = 1; - MetadataPartitionType(String partitionPath, String fileIdPrefix) { + MetadataPartitionType(final String partitionPath, final String fileIdPrefix) { this.partitionPath = partitionPath; this.fileIdPrefix = fileIdPrefix; } - public String partitionPath() { + public String getPartitionPath() { return partitionPath; } @@ -42,7 +46,28 @@ public String getFileIdPrefix() { return fileIdPrefix; } - public static List all() { - return Arrays.asList(MetadataPartitionType.FILES.partitionPath()); + void setFileGroupCount(final int fileGroupCount) { + this.fileGroupCount = fileGroupCount; + } + + public int getFileGroupCount() { + return this.fileGroupCount; + } + + public static List allPaths() { + return Arrays.asList( + FILES.getPartitionPath(), + COLUMN_STATS.getPartitionPath(), + BLOOM_FILTERS.getPartitionPath() + ); + } + + @Override + public String toString() { + return "Metadata partition {" + + "name: " + getPartitionPath() + + ", prefix: " + getFileIdPrefix() + + ", groups: " + getFileGroupCount() + + "}"; } } diff --git a/hudi-common/src/main/java/org/apache/hudi/parquet/io/ByteBufferBackedInputFile.java b/hudi-common/src/main/java/org/apache/hudi/parquet/io/ByteBufferBackedInputFile.java new file mode 100644 index 0000000000000..40454d306ac78 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/parquet/io/ByteBufferBackedInputFile.java @@ -0,0 +1,63 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.parquet.io; + +import org.apache.hudi.common.util.io.ByteBufferBackedInputStream; +import org.apache.parquet.io.DelegatingSeekableInputStream; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.io.SeekableInputStream; + +/** + * Implementation of {@link InputFile} backed by {@code byte[]} buffer + */ +public class ByteBufferBackedInputFile implements InputFile { + private final byte[] buffer; + private final int offset; + private final int length; + + public ByteBufferBackedInputFile(byte[] buffer, int offset, int length) { + this.buffer = buffer; + this.offset = offset; + this.length = length; + } + + public ByteBufferBackedInputFile(byte[] buffer) { + this(buffer, 0, buffer.length); + } + + @Override + public long getLength() { + return length; + } + + @Override + public SeekableInputStream newStream() { + return new DelegatingSeekableInputStream(new ByteBufferBackedInputStream(buffer, offset, length)) { + @Override + public long getPos() { + return ((ByteBufferBackedInputStream) getStream()).getPosition(); + } + + @Override + public void seek(long newPos) { + ((ByteBufferBackedInputStream) getStream()).seek(newPos); + } + }; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/parquet/io/OutputStreamBackedOutputFile.java b/hudi-common/src/main/java/org/apache/hudi/parquet/io/OutputStreamBackedOutputFile.java new file mode 100644 index 0000000000000..48c2c82e7b422 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/parquet/io/OutputStreamBackedOutputFile.java @@ -0,0 +1,94 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
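`ByteBufferBackedInputFile` lets Parquet's reader stack operate on an in-memory byte range instead of a filesystem path, which is useful for parquet content embedded in log files. A hedged usage sketch, assuming a parquet-mr version whose `AvroParquetReader.builder` accepts an `InputFile`:

```java
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.parquet.io.ByteBufferBackedInputFile;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.hadoop.ParquetReader;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

// Sketch: reading parquet content that already sits in memory, e.g. the bytes
// of a parquet data block pulled out of a Hudi log file.
static List<GenericRecord> readInMemoryParquet(byte[] parquetBytes) throws IOException {
  List<GenericRecord> result = new ArrayList<>();
  try (ParquetReader<GenericRecord> reader =
           AvroParquetReader.<GenericRecord>builder(new ByteBufferBackedInputFile(parquetBytes)).build()) {
    GenericRecord record;
    while ((record = reader.read()) != null) {
      result.add(record);
    }
  }
  return result;
}
```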
+ */ + +package org.apache.hudi.parquet.io; + +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.parquet.io.OutputFile; +import org.apache.parquet.io.PositionOutputStream; + +import javax.annotation.Nonnull; +import java.io.IOException; + +/** + * Implementation of the {@link OutputFile} backed by {@link java.io.OutputStream} + */ +public class OutputStreamBackedOutputFile implements OutputFile { + + private static final long DEFAULT_BLOCK_SIZE = 1024L * 1024L; + + private final FSDataOutputStream outputStream; + + public OutputStreamBackedOutputFile(FSDataOutputStream outputStream) { + this.outputStream = outputStream; + } + + @Override + public PositionOutputStream create(long blockSizeHint) { + return new PositionOutputStreamAdapter(outputStream); + } + + @Override + public PositionOutputStream createOrOverwrite(long blockSizeHint) { + return create(blockSizeHint); + } + + @Override + public boolean supportsBlockSize() { + return false; + } + + @Override + public long defaultBlockSize() { + return DEFAULT_BLOCK_SIZE; + } + + private static class PositionOutputStreamAdapter extends PositionOutputStream { + private final FSDataOutputStream delegate; + + PositionOutputStreamAdapter(FSDataOutputStream delegate) { + this.delegate = delegate; + } + + @Override + public long getPos() throws IOException { + return delegate.getPos(); + } + + @Override + public void write(int b) throws IOException { + delegate.write(b); + } + + @Override + public void write(@Nonnull byte[] buffer, int off, int len) throws IOException { + delegate.write(buffer, off, len); + } + + @Override + public void flush() throws IOException { + delegate.flush(); + } + + @Override + public void close() { + // We're deliberately not closing the delegate stream here to allow caller + // to explicitly manage its lifecycle + } + } +} diff --git a/hudi-common/src/main/scala/org/apache/hudi/HoodieTableFileIndexBase.scala b/hudi-common/src/main/scala/org/apache/hudi/HoodieTableFileIndexBase.scala deleted file mode 100644 index f25c7d99d5a5e..0000000000000 --- a/hudi-common/src/main/scala/org/apache/hudi/HoodieTableFileIndexBase.scala +++ /dev/null @@ -1,300 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
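The mirror-image `OutputStreamBackedOutputFile` above lets a Parquet writer target an already-open `FSDataOutputStream`, with the adapter deliberately leaving the stream open for the caller. A hedged usage sketch, assuming `AvroParquetWriter.builder(OutputFile)` is available in the bundled parquet-mr:

```java
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hudi.parquet.io.OutputStreamBackedOutputFile;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.hadoop.ParquetWriter;

import java.io.IOException;
import java.util.List;

// Sketch: writing parquet into a caller-managed stream; the adapter's close()
// intentionally leaves `out` open, so the caller closes it afterwards.
static void writeParquetToStream(FSDataOutputStream out, Schema schema,
                                 List<GenericRecord> records) throws IOException {
  try (ParquetWriter<GenericRecord> writer =
           AvroParquetWriter.<GenericRecord>builder(new OutputStreamBackedOutputFile(out))
               .withSchema(schema)
               .build()) {
    for (GenericRecord record : records) {
      writer.write(record);
    }
  }
  // `out` is still open here; the caller flushes/closes it explicitly
}
```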
- */ - -package org.apache.hudi - -import org.apache.hadoop.fs.{FileStatus, Path} -import org.apache.hudi.common.config.{HoodieMetadataConfig, TypedProperties} -import org.apache.hudi.common.engine.HoodieEngineContext -import org.apache.hudi.common.fs.FSUtils -import org.apache.hudi.common.model.HoodieTableType.MERGE_ON_READ -import org.apache.hudi.common.model.{FileSlice, HoodieTableQueryType} -import org.apache.hudi.common.table.HoodieTableMetaClient -import org.apache.hudi.common.table.view.{FileSystemViewStorageConfig, HoodieTableFileSystemView} - -import scala.collection.JavaConverters._ -import scala.collection.JavaConversions._ -import scala.collection.mutable - -/** - * Common (engine-agnostic) File Index implementation enabling individual query engines to - * list Hudi Table contents based on the - * - *

- * <ul>
- *   <li>Table type (MOR, COW)</li>
- *   <li>Query type (snapshot, read_optimized, incremental)</li>
- *   <li>Query instant/range</li>
- * </ul>
- * - * @param engineContext Hudi engine-specific context - * @param metaClient Hudi table's meta-client - * @param configProperties unifying configuration (in the form of generic properties) - * @param queryType target query type - * @param queryPaths target DFS paths being queried - * @param specifiedQueryInstant instant as of which table is being queried - * @param shouldIncludePendingCommits flags whether file-index should exclude any pending operations - * @param fileStatusCache transient cache of fetched [[FileStatus]]es - */ -abstract class HoodieTableFileIndexBase(engineContext: HoodieEngineContext, - metaClient: HoodieTableMetaClient, - configProperties: TypedProperties, - queryType: HoodieTableQueryType, - protected val queryPaths: Seq[Path], - specifiedQueryInstant: Option[String] = None, - shouldIncludePendingCommits: Boolean = false, - @transient fileStatusCache: FileStatusCacheTrait) { - /** - * Get all completeCommits. - */ - lazy val completedCommits = metaClient.getCommitsTimeline - .filterCompletedInstants().getInstants.iterator().toList.map(_.getTimestamp) - - private lazy val _partitionColumns: Array[String] = - metaClient.getTableConfig.getPartitionFields.orElse(Array[String]()) - - private lazy val fileSystemStorageConfig = FileSystemViewStorageConfig.newBuilder() - .fromProperties(configProperties) - .build() - private lazy val metadataConfig = HoodieMetadataConfig.newBuilder - .fromProperties(configProperties) - .build() - - private val tableType = metaClient.getTableType - - protected val basePath: String = metaClient.getBasePath - - @transient - @volatile protected var cachedFileSize: Long = 0L - @transient - @volatile protected var cachedAllInputFileSlices: Map[PartitionPath, Seq[FileSlice]] = _ - @volatile protected var queryAsNonePartitionedTable: Boolean = _ - @transient - @volatile private var fileSystemView: HoodieTableFileSystemView = _ - - refresh0() - - /** - * Fetch list of latest base files and log files per partition. - * - * @return mapping from string partition paths to its base/log files - */ - def listFileSlices(): Map[String, Seq[FileSlice]] = { - if (queryAsNonePartitionedTable) { - // Read as Non-Partitioned table. 
- cachedAllInputFileSlices.map(entry => (entry._1.path, entry._2)) - } else { - cachedAllInputFileSlices.keys.toSeq.map(partition => { - (partition.path, cachedAllInputFileSlices(partition)) - }).toMap - } - } - - private def refresh0(): Unit = { - val startTime = System.currentTimeMillis() - val partitionFiles = loadPartitionPathFiles() - val allFiles = partitionFiles.values.reduceOption(_ ++ _) - .getOrElse(Array.empty[FileStatus]) - - metaClient.reloadActiveTimeline() - - val activeTimeline = getActiveTimeline - val latestInstant = activeTimeline.lastInstant() - // TODO we can optimize the flow by: - // - First fetch list of files from instants of interest - // - Load FileStatus's - fileSystemView = new HoodieTableFileSystemView(metaClient, activeTimeline, allFiles) - val queryInstant = if (specifiedQueryInstant.isDefined) { - specifiedQueryInstant - } else if (latestInstant.isPresent) { - Some(latestInstant.get.getTimestamp) - } else { - None - } - - (tableType, queryType) match { - case (MERGE_ON_READ, HoodieTableQueryType.QUERY_TYPE_SNAPSHOT) => - // Fetch and store latest base and log files, and their sizes - cachedAllInputFileSlices = partitionFiles.map(p => { - val latestSlices = if (queryInstant.isDefined) { - fileSystemView.getLatestMergedFileSlicesBeforeOrOn(p._1.path, queryInstant.get) - .iterator().asScala.toSeq - } else { - Seq() - } - (p._1, latestSlices) - }) - cachedFileSize = cachedAllInputFileSlices.values.flatten.map(fileSlice => { - if (fileSlice.getBaseFile.isPresent) { - fileSlice.getBaseFile.get().getFileLen + fileSlice.getLogFiles.iterator().asScala.map(_.getFileSize).sum - } else { - fileSlice.getLogFiles.iterator().asScala.map(_.getFileSize).sum - } - }).sum - case (_, _) => - // Fetch and store latest base files and its sizes - cachedAllInputFileSlices = partitionFiles.map(p => { - val fileSlices = specifiedQueryInstant - .map(instant => - fileSystemView.getLatestFileSlicesBeforeOrOn(p._1.path, instant, true)) - .getOrElse(fileSystemView.getLatestFileSlices(p._1.path)) - .iterator().asScala.toSeq - (p._1, fileSlices) - }) - cachedFileSize = cachedAllInputFileSlices.values.flatten.map(fileSliceSize).sum - } - - // If the partition value contains InternalRow.empty, we query it as a non-partitioned table. - queryAsNonePartitionedTable = partitionFiles.keys.exists(p => p.values.isEmpty) - val flushSpend = System.currentTimeMillis() - startTime - - logInfo(s"Refresh table ${metaClient.getTableConfig.getTableName}," + - s" spend: $flushSpend ms") - } - - protected def refresh(): Unit = { - fileStatusCache.invalidate() - refresh0() - } - - private def getActiveTimeline = { - val timeline = metaClient.getActiveTimeline.getCommitsTimeline - if (shouldIncludePendingCommits) { - timeline - } else { - timeline.filterCompletedInstants() - } - } - - private def fileSliceSize(fileSlice: FileSlice): Long = { - val logFileSize = fileSlice.getLogFiles.iterator().asScala.map(_.getFileSize).filter(_ > 0).sum - if (fileSlice.getBaseFile.isPresent) { - fileSlice.getBaseFile.get().getFileLen + logFileSize - } else { - logFileSize - } - } - - /** - * Load all partition paths and it's files under the query table path. - */ - private def loadPartitionPathFiles(): Map[PartitionPath, Array[FileStatus]] = { - val partitionPaths = getAllQueryPartitionPaths - // List files in all of the partition path. 
- val pathToFetch = mutable.ArrayBuffer[PartitionPath]() - val cachePartitionToFiles = mutable.Map[PartitionPath, Array[FileStatus]]() - // Fetch from the FileStatusCache - partitionPaths.foreach { partitionPath => - fileStatusCache.get(partitionPath.fullPartitionPath(basePath)) match { - case Some(filesInPartition) => - cachePartitionToFiles.put(partitionPath, filesInPartition) - - case None => pathToFetch.append(partitionPath) - } - } - - val fetchedPartitionToFiles = - if (pathToFetch.nonEmpty) { - val fullPartitionPathsToFetch = pathToFetch.map(p => (p, p.fullPartitionPath(basePath).toString)).toMap - val partitionToFilesMap = FSUtils.getFilesInPartitions(engineContext, metadataConfig, basePath, - fullPartitionPathsToFetch.values.toArray, fileSystemStorageConfig.getSpillableDir) - fullPartitionPathsToFetch.map(p => { - (p._1, partitionToFilesMap.get(p._2)) - }) - } else { - Map.empty[PartitionPath, Array[FileStatus]] - } - - // Update the fileStatusCache - fetchedPartitionToFiles.foreach { - case (partitionRowPath, filesInPartition) => - fileStatusCache.put(partitionRowPath.fullPartitionPath(basePath), filesInPartition) - } - cachePartitionToFiles.toMap ++ fetchedPartitionToFiles - } - - def getAllQueryPartitionPaths: Seq[PartitionPath] = { - val queryRelativePartitionPaths = queryPaths.map(FSUtils.getRelativePartitionPath(new Path(basePath), _)) - // Load all the partition path from the basePath, and filter by the query partition path. - // TODO load files from the queryRelativePartitionPaths directly. - val partitionPaths = FSUtils.getAllPartitionPaths(engineContext, metadataConfig, basePath).asScala - .filter(path => queryRelativePartitionPaths.exists(path.startsWith)) - - val partitionSchema = _partitionColumns - - // Convert partition's path into partition descriptor - partitionPaths.map { partitionPath => - val partitionColumnValues = parsePartitionColumnValues(partitionSchema, partitionPath) - PartitionPath(partitionPath, partitionColumnValues) - } - } - - /** - * Parses partition columns' values from the provided partition's path, returning list of - * values (that might have engine-specific representation) - * - * @param partitionColumns partitioning columns identifying the partition - * @param partitionPath partition's path to parse partitioning columns' values from - */ - protected def parsePartitionColumnValues(partitionColumns: Array[String], partitionPath: String): Array[Any] - - // TODO eval whether we should just use logger directly - protected def logWarning(str: => String): Unit - protected def logInfo(str: => String): Unit - - /** - * Represents a partition as a tuple of - *
- * <ul>
- *   <li>Actual partition path (relative to the table's base path)</li>
- *   <li>Values of the corresponding columns table is being partitioned by (partitioning columns)</li>
- * </ul>
- * - * E.g. PartitionPath("2021/02/01", Array("2021","02","01")) - * - * NOTE: Partitioning column values might have engine specific representation (for ex, - * {@code UTF8String} for Spark, etc) and are solely used in partition pruning in a very - * engine-specific way - * - * @param values values of the corresponding partitioning columns - * @param path partition's path - * - * TODO expose as a trait and make impls engine-specific (current impl is tailored for Spark) - */ - case class PartitionPath(path: String, values: Array[Any]) { - override def equals(other: Any): Boolean = other match { - case PartitionPath(otherPath, _) => path == otherPath - case _ => false - } - - override def hashCode(): Int = { - path.hashCode - } - - def fullPartitionPath(basePath: String): Path = { - if (path.isEmpty) { - new Path(basePath) // This is a non-partition path - } else { - new Path(basePath, path) - } - } - } -} - -trait FileStatusCacheTrait { - def get(path: Path): Option[Array[FileStatus]] - def put(path: Path, leafFiles: Array[FileStatus]): Unit - def invalidate(): Unit -} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java index e4460ce629f40..f51702a447258 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java @@ -68,8 +68,8 @@ public class TestFSUtils extends HoodieCommonTestHarness { private final long minRollbackToKeep = 10; private final long minCleanToKeep = 10; - private static final String TEST_WRITE_TOKEN = "1-0-1"; - private static final String BASE_FILE_EXTENSION = HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension(); + private static String TEST_WRITE_TOKEN = "1-0-1"; + public static final String BASE_FILE_EXTENSION = HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension(); @Rule public final EnvironmentVariables environmentVariables = new EnvironmentVariables(); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtilsWithRetryWrapperEnable.java b/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtilsWithRetryWrapperEnable.java new file mode 100644 index 0000000000000..0b849ebec8185 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtilsWithRetryWrapperEnable.java @@ -0,0 +1,210 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.common.fs; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.util.Progressable; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.net.URI; +import java.util.Arrays; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertThrows; + +/** + * Tests file system utils with the retry wrapper enabled. + * P.S. This extends TestFSUtils and sets up a HoodieWrapperFileSystem for the metaClient, so all the TestFSUtils UTs run with the retry wrapper enabled. + */ +public class TestFSUtilsWithRetryWrapperEnable extends TestFSUtils { + + private static final String EXCEPTION_MESSAGE = "Fake runtime exception here."; + private long maxRetryIntervalMs; + private int maxRetryNumbers; + private long initialRetryIntervalMs; + + @Override + @BeforeEach + public void setUp() throws IOException { + initMetaClient(); + basePath = "file:" + basePath; + FileSystemRetryConfig fileSystemRetryConfig = FileSystemRetryConfig.newBuilder().withFileSystemActionRetryEnabled(true).build(); + maxRetryIntervalMs = fileSystemRetryConfig.getMaxRetryIntervalMs(); + maxRetryNumbers = fileSystemRetryConfig.getMaxRetryNumbers(); + initialRetryIntervalMs = fileSystemRetryConfig.getInitialRetryIntervalMs(); + + FakeRemoteFileSystem fakeFs = new FakeRemoteFileSystem(FSUtils.getFs(metaClient.getMetaPath(), metaClient.getHadoopConf()), 2); + FileSystem fileSystem = new HoodieRetryWrapperFileSystem(fakeFs, maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, ""); + + HoodieWrapperFileSystem fs = new HoodieWrapperFileSystem(fileSystem, new NoOpConsistencyGuard()); + metaClient.setFs(fs); + } + + // Test the scenario that fs keeps retrying until it fails. + @Test + public void testProcessFilesWithExceptions() throws Exception { + FakeRemoteFileSystem fakeFs = new FakeRemoteFileSystem(FSUtils.getFs(metaClient.getMetaPath(), metaClient.getHadoopConf()), 100); + FileSystem fileSystem = new HoodieRetryWrapperFileSystem(fakeFs, maxRetryIntervalMs, maxRetryNumbers, initialRetryIntervalMs, ""); + HoodieWrapperFileSystem fs = new HoodieWrapperFileSystem(fileSystem, new NoOpConsistencyGuard()); + metaClient.setFs(fs); + List folders = + Arrays.asList("2016/04/15", ".hoodie/.temp/2/2016/04/15"); + folders.forEach(f -> assertThrows(RuntimeException.class, () -> metaClient.getFs().mkdirs(new Path(new Path(basePath), f)))); + } + + /** + * Fake remote FileSystem which throws a RuntimeException, similar to an AmazonS3Exception 503. 
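The test wires this flaky filesystem (every call fails except each `loop`-th one) underneath `HoodieRetryWrapperFileSystem`. The wrapper's behavior, as configured by `FileSystemRetryConfig`, plausibly amounts to the backoff loop below; this is a sketch of the assumed semantics, not the wrapper's actual code:

```java
import java.util.concurrent.Callable;

// Assumed retry-with-exponential-backoff semantics behind HoodieRetryWrapperFileSystem.
static <T> T runWithRetry(Callable<T> action, int maxRetryNumbers,
                          long initialRetryIntervalMs, long maxRetryIntervalMs) throws Exception {
  long intervalMs = initialRetryIntervalMs;
  for (int attempt = 0; ; attempt++) {
    try {
      return action.call();
    } catch (RuntimeException e) {
      if (attempt >= maxRetryNumbers) {
        throw e; // retries exhausted, surface the failure (as the test asserts)
      }
      Thread.sleep(intervalMs);
      intervalMs = Math.min(intervalMs * 2, maxRetryIntervalMs); // back off, capped
    }
  }
}
```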
+ */ + class FakeRemoteFileSystem extends FileSystem { + + private FileSystem fs; + private int count = 1; + private int loop; + + public FakeRemoteFileSystem(FileSystem fs, int retryLoop) { + this.fs = fs; + this.loop = retryLoop; + } + + @Override + public URI getUri() { + return fs.getUri(); + } + + @Override + public FSDataInputStream open(Path f, int bufferSize) throws IOException { + if (count % loop == 0) { + count++; + return fs.open(f, bufferSize); + } else { + count++; + throw new RuntimeException(EXCEPTION_MESSAGE); + } + } + + @Override + public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite, int bufferSize, short replication, long blockSize, Progressable progress) throws IOException { + if (count % loop == 0) { + count++; + return fs.create(f, permission, overwrite, bufferSize, replication, blockSize, progress); + } else { + count++; + throw new RuntimeException(EXCEPTION_MESSAGE); + } + } + + @Override + public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) throws IOException { + if (count % loop == 0) { + count++; + return fs.append(f, bufferSize, progress); + } else { + count++; + throw new RuntimeException(EXCEPTION_MESSAGE); + } + } + + @Override + public boolean rename(Path src, Path dst) throws IOException { + if (count % loop == 0) { + count++; + return fs.rename(src, dst); + } else { + count++; + throw new RuntimeException(EXCEPTION_MESSAGE); + } + } + + @Override + public boolean delete(Path f, boolean recursive) throws IOException { + if (count % loop == 0) { + count++; + return fs.delete(f, recursive); + } else { + count++; + throw new RuntimeException(EXCEPTION_MESSAGE); + } + } + + @Override + public FileStatus[] listStatus(Path f) throws FileNotFoundException, IOException { + if (count % loop == 0) { + count++; + return fs.listStatus(f); + } else { + count++; + throw new RuntimeException(EXCEPTION_MESSAGE); + } + } + + @Override + public void setWorkingDirectory(Path newDir) { + fs.setWorkingDirectory(newDir); + } + + @Override + public Path getWorkingDirectory() { + return fs.getWorkingDirectory(); + } + + @Override + public boolean mkdirs(Path f, FsPermission permission) throws IOException { + if (count % loop == 0) { + count++; + return fs.mkdirs(f, permission); + } else { + count++; + throw new RuntimeException(EXCEPTION_MESSAGE); + } + } + + @Override + public FileStatus getFileStatus(Path f) throws IOException { + if (count % loop == 0) { + count++; + return fs.getFileStatus(f); + } else { + count++; + throw new RuntimeException(EXCEPTION_MESSAGE); + } + } + + @Override + public RemoteIterator listLocatedStatus(Path f) throws IOException { + return fs.listLocatedStatus(f); + } + + @Override + public Configuration getConf() { + return fs.getConf(); + } + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestParquetInLining.java b/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestParquetInLining.java index c4e728dc24909..9ed27c4b2d63c 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestParquetInLining.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/fs/inline/TestParquetInLining.java @@ -19,6 +19,7 @@ package org.apache.hudi.common.fs.inline; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.testutils.FileSystemTestUtils; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; @@ -144,7 +145,8 @@ static List getParquetHoodieRecords() 
throws IOException { List hoodieRecords = dataGenerator.generateInsertsWithHoodieAvroPayload(commitTime, 10); List toReturn = new ArrayList<>(); for (HoodieRecord record : hoodieRecords) { - toReturn.add((GenericRecord) record.getData().getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA).get()); + toReturn.add((GenericRecord) ((HoodieAvroRecord) record).getData() + .getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA).get()); } return toReturn; } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormat.java b/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormat.java index bbfd8cf4ad39b..e9b06e6d6397d 100755 --- a/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormat.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormat.java @@ -18,16 +18,10 @@ package org.apache.hudi.common.functional; -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieArchivedLogFile; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; @@ -46,16 +40,30 @@ import org.apache.hudi.common.table.log.block.HoodieLogBlock; import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType; import org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType; +import org.apache.hudi.common.table.log.block.HoodieParquetDataBlock; import org.apache.hudi.common.testutils.FileCreateUtils; +import org.apache.hudi.common.testutils.HadoopMapRedUtils; import org.apache.hudi.common.testutils.HoodieCommonTestHarness; import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.common.testutils.SchemaTestUtil; import org.apache.hudi.common.testutils.minicluster.MiniClusterUtil; +import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.ExternalSpillableMap; import org.apache.hudi.exception.CorruptedLogFileException; - import org.apache.hudi.exception.HoodieIOException; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.io.compress.Compression; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.hadoop.util.counters.BenchmarkCounter; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; @@ -96,11 +104,12 @@ @SuppressWarnings("Duplicates") public class TestHoodieLogFormat extends HoodieCommonTestHarness { + private static final HoodieLogBlockType DEFAULT_DATA_BLOCK_TYPE = HoodieLogBlockType.AVRO_DATA_BLOCK; + private static String BASE_OUTPUT_PATH = "/tmp/"; private FileSystem fs; private Path partitionPath; private int bufferSize = 4096; - private HoodieLogBlockType 
dataBlockType = HoodieLogBlockType.AVRO_DATA_BLOCK; @BeforeAll public static void setUpClass() throws IOException, InterruptedException { @@ -139,7 +148,7 @@ public void testEmptyLog() throws IOException { } @ParameterizedTest - @EnumSource(names = { "AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK" }) + @EnumSource(names = {"AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK", "PARQUET_DATA_BLOCK"}) public void testBasicAppend(HoodieLogBlockType dataBlockType) throws IOException, InterruptedException, URISyntaxException { Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) @@ -171,7 +180,7 @@ public void testRollover() throws IOException, InterruptedException, URISyntaxEx Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - HoodieDataBlock dataBlock = getDataBlock(records, header); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header); // Write out a block AppendResult firstAppend = writer.appendBlock(dataBlock); // Get the size of the block @@ -186,7 +195,7 @@ public void testRollover() throws IOException, InterruptedException, URISyntaxEx HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).withSizeThreshold(size - 1).build(); records = SchemaTestUtil.generateTestRecords(0, 100); - dataBlock = getDataBlock(records, header); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header); AppendResult secondAppend = writer.appendBlock(dataBlock); assertEquals(firstAppend.logFile(), secondAppend.logFile()); @@ -198,7 +207,7 @@ public void testRollover() throws IOException, InterruptedException, URISyntaxEx // Write one more block, which should not go to the new log file. 
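// (Rollover expected here: the size threshold was exceeded by the previous append, so this block goes to a fresh log file, as the assertNotEquals below verifies.)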
records = SchemaTestUtil.generateTestRecords(0, 100); - dataBlock = getDataBlock(records, header); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header); AppendResult rolloverAppend = writer.appendBlock(dataBlock); assertNotEquals(secondAppend.logFile(), rolloverAppend.logFile()); @@ -245,7 +254,7 @@ private void testConcurrentAppend(boolean logFileExists, boolean newLogFileForma Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - HoodieDataBlock dataBlock = getDataBlock(records, header); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header); writer.appendBlock(dataBlock); Writer writer2 = builder2.build(); writer2.appendBlock(dataBlock); @@ -257,8 +266,9 @@ private void testConcurrentAppend(boolean logFileExists, boolean newLogFileForma assertEquals(logFile1.getLogVersion(), logFile2.getLogVersion() - 1, "Log Files must have different versions"); } - @Test - public void testMultipleAppend() throws IOException, URISyntaxException, InterruptedException { + @ParameterizedTest + @EnumSource(names = {"AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK", "PARQUET_DATA_BLOCK"}) + public void testMultipleAppend(HoodieLogBlockType dataBlockType) throws IOException, URISyntaxException, InterruptedException { Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); @@ -266,7 +276,7 @@ public void testMultipleAppend() throws IOException, URISyntaxException, Interru Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - HoodieDataBlock dataBlock = getDataBlock(records, header); + HoodieDataBlock dataBlock = getDataBlock(dataBlockType, records, header); writer.appendBlock(dataBlock); long size1 = writer.getCurrentSize(); writer.close(); @@ -276,7 +286,7 @@ public void testMultipleAppend() throws IOException, URISyntaxException, Interru .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); records = SchemaTestUtil.generateTestRecords(0, 100); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - dataBlock = getDataBlock(records, header); + dataBlock = getDataBlock(dataBlockType, records, header); writer.appendBlock(dataBlock); long size2 = writer.getCurrentSize(); assertTrue(size2 > size1, "We just wrote a new block - size2 should be > size1"); @@ -290,7 +300,7 @@ public void testMultipleAppend() throws IOException, URISyntaxException, Interru .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); records = SchemaTestUtil.generateTestRecords(0, 100); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - dataBlock = getDataBlock(records, header); + dataBlock = getDataBlock(dataBlockType, records, header); writer.appendBlock(dataBlock); long size3 = writer.getCurrentSize(); assertTrue(size3 > size2, "We just wrote a new block - size3 should be > size2"); @@ -309,26 +319,27 @@ public void testMultipleAppend() throws IOException, URISyntaxException, Interru * This is actually a test on concurrent append and not recovery lease. Commenting this out. 
* https://issues.apache.org/jira/browse/HUDI-117 */ + /** * @Test public void testLeaseRecovery() throws IOException, URISyntaxException, InterruptedException { Writer writer - * = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - * .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") - * .overBaseCommit("100").withFs(fs).build(); List records = - * SchemaTestUtil.generateTestRecords(0, 100); Map header = - * Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); - * header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); HoodieAvroDataBlock - * dataBlock = new HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); long size1 = - * writer.getCurrentSize(); // do not close this writer - this simulates a data note appending to a log dying - * without closing the file // writer.close(); - * - * writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) - * .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") - * .withFs(fs).build(); records = SchemaTestUtil.generateTestRecords(0, 100); - * header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); dataBlock = new - * HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); long size2 = - * writer.getCurrentSize(); assertTrue("We just wrote a new block - size2 should be > size1", size2 > size1); - * assertEquals("Write should be auto-flushed. The size reported by FileStatus and the writer should match", - * size2, fs.getFileStatus(writer.getLogFile().getPath()).getLen()); writer.close(); } + * = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) + * .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") + * .overBaseCommit("100").withFs(fs).build(); List records = + * SchemaTestUtil.generateTestRecords(0, 100); Map header = + * Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); + * header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); HoodieAvroDataBlock + * dataBlock = new HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); long size1 = + * writer.getCurrentSize(); // do not close this writer - this simulates a data note appending to a log dying + * without closing the file // writer.close(); + *

+ * writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) + * .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100") + * .withFs(fs).build(); records = SchemaTestUtil.generateTestRecords(0, 100); + * header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); dataBlock = new + * HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); long size2 = + * writer.getCurrentSize(); assertTrue("We just wrote a new block - size2 should be > size1", size2 > size1); + * assertEquals("Write should be auto-flushed. The size reported by FileStatus and the writer should match", + * size2, fs.getFileStatus(writer.getLogFile().getPath()).getLen()); writer.close(); } */ @Test @@ -344,7 +355,7 @@ public void testAppendNotSupported() throws IOException, URISyntaxException, Int Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - HoodieDataBlock dataBlock = getDataBlock(records, header); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header); for (int i = 0; i < 2; i++) { Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(testPath) @@ -371,18 +382,19 @@ public void testBasicWriteAndScan() throws IOException, URISyntaxException, Inte Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - HoodieDataBlock dataBlock = getDataBlock(records, header); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header); writer.appendBlock(dataBlock); writer.close(); Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema()); assertTrue(reader.hasNext(), "We wrote a block, we should be able to read it"); HoodieLogBlock nextBlock = reader.next(); - assertEquals(dataBlockType, nextBlock.getBlockType(), "The next block should be a data block"); + assertEquals(DEFAULT_DATA_BLOCK_TYPE, nextBlock.getBlockType(), "The next block should be a data block"); HoodieDataBlock dataBlockRead = (HoodieDataBlock) nextBlock; - assertEquals(copyOfRecords.size(), dataBlockRead.getRecords().size(), + List recordsRead = getRecords(dataBlockRead); + assertEquals(copyOfRecords.size(), recordsRead.size(), "Read records size should be equal to the written records size"); - assertEquals(copyOfRecords, dataBlockRead.getRecords(), + assertEquals(copyOfRecords, recordsRead, "Both records lists should be the same. 
(ordering guaranteed)"); reader.close(); } @@ -400,10 +412,10 @@ public void testHugeLogFileWrite() throws IOException, URISyntaxException, Inter Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - byte[] dataBlockContentBytes = getDataBlock(records, header).getContentBytes(); - HoodieDataBlock reusableDataBlock = new HoodieAvroDataBlock(null, null, - Option.ofNullable(dataBlockContentBytes), false, 0, dataBlockContentBytes.length, - 0, getSimpleSchema(), header, new HashMap<>(), HoodieRecord.RECORD_KEY_METADATA_FIELD); + byte[] dataBlockContentBytes = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header).getContentBytes(); + HoodieLogBlock.HoodieLogBlockContentLocation logBlockContentLoc = new HoodieLogBlock.HoodieLogBlockContentLocation(new Configuration(), null, 0, dataBlockContentBytes.length, 0); + HoodieDataBlock reusableDataBlock = new HoodieAvroDataBlock(null, Option.ofNullable(dataBlockContentBytes), false, + logBlockContentLoc, Option.ofNullable(getSimpleSchema()), header, new HashMap<>(), HoodieRecord.RECORD_KEY_METADATA_FIELD); long writtenSize = 0; int logBlockWrittenNum = 0; while (writtenSize < Integer.MAX_VALUE) { @@ -418,11 +430,12 @@ public void testHugeLogFileWrite() throws IOException, URISyntaxException, Inter true, true); assertTrue(reader.hasNext(), "We wrote a block, we should be able to read it"); HoodieLogBlock nextBlock = reader.next(); - assertEquals(dataBlockType, nextBlock.getBlockType(), "The next block should be a data block"); + assertEquals(DEFAULT_DATA_BLOCK_TYPE, nextBlock.getBlockType(), "The next block should be a data block"); HoodieDataBlock dataBlockRead = (HoodieDataBlock) nextBlock; - assertEquals(copyOfRecords.size(), dataBlockRead.getRecords().size(), + List recordsRead = getRecords(dataBlockRead); + assertEquals(copyOfRecords.size(), recordsRead.size(), "Read records size should be equal to the written records size"); - assertEquals(copyOfRecords, dataBlockRead.getRecords(), + assertEquals(copyOfRecords, recordsRead, "Both records lists should be the same. 
(ordering guaranteed)"); int logBlockReadNum = 1; while (reader.hasNext()) { @@ -447,11 +460,16 @@ public void testHugeLogFileWrite() throws IOException, URISyntaxException, Inter oversizeWriter.close(); } - @Test - public void testBasicAppendAndRead() throws IOException, URISyntaxException, InterruptedException { - Writer writer = - HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) - .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); + @ParameterizedTest + @EnumSource(names = {"AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK", "PARQUET_DATA_BLOCK"}) + public void testBasicAppendAndRead(HoodieLogBlockType dataBlockType) throws IOException, URISyntaxException, InterruptedException { + Writer writer = HoodieLogFormat.newWriterBuilder() + .onParentPath(partitionPath) + .withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1") + .overBaseCommit("100") + .withFs(fs) + .build(); List records1 = SchemaTestUtil.generateTestRecords(0, 100); Schema schema = getSimpleSchema(); List copyOfRecords1 = records1.stream() @@ -459,30 +477,39 @@ public void testBasicAppendAndRead() throws IOException, URISyntaxException, Int Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - HoodieDataBlock dataBlock = getDataBlock(records1, header); + HoodieDataBlock dataBlock = getDataBlock(dataBlockType, records1, header); writer.appendBlock(dataBlock); writer.close(); - writer = - HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) - .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); + writer = HoodieLogFormat.newWriterBuilder() + .onParentPath(partitionPath) + .withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1") + .overBaseCommit("100") + .withFs(fs) + .build(); List records2 = SchemaTestUtil.generateTestRecords(0, 100); List copyOfRecords2 = records2.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - dataBlock = getDataBlock(records2, header); + dataBlock = getDataBlock(dataBlockType, records2, header); writer.appendBlock(dataBlock); writer.close(); // Close and Open again and append 100 more records - writer = - HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) - .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); + writer = HoodieLogFormat.newWriterBuilder() + .onParentPath(partitionPath) + .withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1") + .overBaseCommit("100") + .withFs(fs) + .build(); + List records3 = SchemaTestUtil.generateTestRecords(0, 100); List copyOfRecords3 = records3.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - dataBlock = getDataBlock(records3, header); + dataBlock = getDataBlock(dataBlockType, records3, header); writer.appendBlock(dataBlock); writer.close(); @@ -490,26 +517,29 @@ public void testBasicAppendAndRead() throws IOException, URISyntaxException, Int assertTrue(reader.hasNext(), "First block should be available"); HoodieLogBlock nextBlock = reader.next(); 
HoodieDataBlock dataBlockRead = (HoodieDataBlock) nextBlock; - assertEquals(copyOfRecords1.size(), dataBlockRead.getRecords().size(), + List recordsRead1 = getRecords(dataBlockRead); + assertEquals(copyOfRecords1.size(),recordsRead1.size(), "Read records size should be equal to the written records size"); - assertEquals(copyOfRecords1, dataBlockRead.getRecords(), + assertEquals(copyOfRecords1, recordsRead1, "Both records lists should be the same. (ordering guaranteed)"); assertEquals(dataBlockRead.getSchema(), getSimpleSchema()); reader.hasNext(); nextBlock = reader.next(); dataBlockRead = (HoodieDataBlock) nextBlock; - assertEquals(copyOfRecords2.size(), dataBlockRead.getRecords().size(), + List recordsRead2 = getRecords(dataBlockRead); + assertEquals(copyOfRecords2.size(), recordsRead2.size(), "Read records size should be equal to the written records size"); - assertEquals(copyOfRecords2, dataBlockRead.getRecords(), + assertEquals(copyOfRecords2, recordsRead2, "Both records lists should be the same. (ordering guaranteed)"); reader.hasNext(); nextBlock = reader.next(); dataBlockRead = (HoodieDataBlock) nextBlock; - assertEquals(copyOfRecords3.size(), dataBlockRead.getRecords().size(), + List recordsRead3 = getRecords(dataBlockRead); + assertEquals(copyOfRecords3.size(), recordsRead3.size(), "Read records size should be equal to the written records size"); - assertEquals(copyOfRecords3, dataBlockRead.getRecords(), + assertEquals(copyOfRecords3, recordsRead3, "Both records lists should be the same. (ordering guaranteed)"); reader.close(); } @@ -538,7 +568,7 @@ public void testBasicAppendAndScanMultipleFiles(ExternalSpillableMap.DiskMapType .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); allRecords.add(copyOfRecords1); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records1, header); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header); writer.appendBlock(dataBlock); } writer.close(); @@ -563,7 +593,8 @@ public void testBasicAppendAndScanMultipleFiles(ExternalSpillableMap.DiskMapType List scannedRecords = new ArrayList<>(); for (HoodieRecord record : scanner) { - scannedRecords.add((IndexedRecord) record.getData().getInsertValue(schema).get()); + scannedRecords.add((IndexedRecord) + ((HoodieAvroRecord) record).getData().getInsertValue(schema).get()); } assertEquals(scannedRecords.size(), allRecords.stream().mapToLong(Collection::size).sum(), @@ -580,7 +611,7 @@ public void testAppendAndReadOnCorruptedLog() throws IOException, URISyntaxExcep Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - HoodieDataBlock dataBlock = getDataBlock(records, header); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header); writer.appendBlock(dataBlock); writer.close(); @@ -602,11 +633,11 @@ public void testAppendAndReadOnCorruptedLog() throws IOException, URISyntaxExcep // Append a proper block that is of the missing length of the corrupted block writer = - HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) - .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); + HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) + 
.withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); records = SchemaTestUtil.generateTestRecords(0, 10); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - dataBlock = getDataBlock(records, header); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header); writer.appendBlock(dataBlock); writer.close(); @@ -644,7 +675,7 @@ public void testAppendAndReadOnCorruptedLog() throws IOException, URISyntaxExcep .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); records = SchemaTestUtil.generateTestRecords(0, 100); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - dataBlock = getDataBlock(records, header); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header); writer.appendBlock(dataBlock); writer.close(); @@ -674,7 +705,7 @@ public void testValidateCorruptBlockEndPosition() throws IOException, URISyntaxE Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - HoodieDataBlock dataBlock = getDataBlock(records, header); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header); writer.appendBlock(dataBlock); writer.close(); @@ -702,7 +733,7 @@ public void testValidateCorruptBlockEndPosition() throws IOException, URISyntaxE .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); records = SchemaTestUtil.generateTestRecords(0, 10); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - dataBlock = getDataBlock(records, header); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header); writer.appendBlock(dataBlock); writer.close(); @@ -741,7 +772,7 @@ public void testAvroLogRecordReaderBasic(ExternalSpillableMap.DiskMapType diskMa Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records1, header); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header); writer.appendBlock(dataBlock); // Write 2 @@ -749,7 +780,7 @@ public void testAvroLogRecordReaderBasic(ExternalSpillableMap.DiskMapType diskMa List copyOfRecords2 = records2.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - dataBlock = getDataBlock(records2, header); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2, header); writer.appendBlock(dataBlock); writer.close(); @@ -804,14 +835,14 @@ public void testAvroLogRecordReaderWithRollbackTombstone(ExternalSpillableMap.Di header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records1, header); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header); writer.appendBlock(dataBlock); // Write 2 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "101"); List records2 = SchemaTestUtil.generateHoodieTestRecords(0, 100); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - dataBlock = getDataBlock(records2, header); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2, header); writer.appendBlock(dataBlock); // Rollback the last write @@ 
-827,7 +858,7 @@ public void testAvroLogRecordReaderWithRollbackTombstone(ExternalSpillableMap.Di List copyOfRecords3 = records3.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - dataBlock = getDataBlock(records3, header); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records3, header); writer.appendBlock(dataBlock); writer.close(); @@ -880,7 +911,7 @@ public void testAvroLogRecordReaderWithFailedPartialBlock(ExternalSpillableMap.D Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records1, header); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header); writer.appendBlock(dataBlock); writer.close(); @@ -914,7 +945,7 @@ public void testAvroLogRecordReaderWithFailedPartialBlock(ExternalSpillableMap.D .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - dataBlock = getDataBlock(records3, header); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records3, header); writer.appendBlock(dataBlock); writer.close(); @@ -968,7 +999,7 @@ public void testAvroLogRecordReaderWithDeleteAndRollback(ExternalSpillableMap.Di Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records1, header); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header); writer.appendBlock(dataBlock); // Write 2 @@ -976,7 +1007,7 @@ public void testAvroLogRecordReaderWithDeleteAndRollback(ExternalSpillableMap.Di List records2 = SchemaTestUtil.generateHoodieTestRecords(0, 100); List copyOfRecords2 = records2.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); - dataBlock = getDataBlock(records2, header); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2, header); writer.appendBlock(dataBlock); copyOfRecords1.addAll(copyOfRecords2); @@ -1089,13 +1120,13 @@ public void testAvroLogRecordReaderWithFailedRollbacks(ExternalSpillableMap.Disk header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records1, header); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header); writer.appendBlock(dataBlock); // Write 2 List records2 = SchemaTestUtil.generateHoodieTestRecords(0, 100); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - dataBlock = getDataBlock(records2, header); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2, header); writer.appendBlock(dataBlock); // Delete 50 keys @@ -1173,7 +1204,7 @@ public void testAvroLogRecordReaderWithInsertDeleteAndRollback(ExternalSpillable header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records1, header); + HoodieDataBlock dataBlock = 
getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header); writer.appendBlock(dataBlock); // Delete 50 keys @@ -1232,7 +1263,7 @@ public void testAvroLogRecordReaderWithInvalidRollback(ExternalSpillableMap.Disk Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records1, header); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header); writer.appendBlock(dataBlock); FileCreateUtils.createDeltaCommit(basePath, "100", fs); @@ -1290,7 +1321,7 @@ public void testAvroLogRecordReaderWithInsertsDeleteAndRollback(ExternalSpillabl header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records1, header); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header); writer.appendBlock(dataBlock); writer.appendBlock(dataBlock); writer.appendBlock(dataBlock); @@ -1354,7 +1385,7 @@ public void testAvroLogRecordReaderWithMixedInsertsCorruptsAndRollback(ExternalS Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records1, header); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header); writer.appendBlock(dataBlock); writer.appendBlock(dataBlock); writer.appendBlock(dataBlock); @@ -1473,7 +1504,7 @@ private void testAvroLogRecordReaderMergingMultipleLogFiles(int numRecordsInLog1 Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records.subList(0, numRecordsInLog1), header); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records.subList(0, numRecordsInLog1), header); writer.appendBlock(dataBlock); // Get the size of the block long size = writer.getCurrentSize(); @@ -1487,7 +1518,7 @@ private void testAvroLogRecordReaderMergingMultipleLogFiles(int numRecordsInLog1 Map header2 = new HashMap<>(); header2.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header2.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock2 = getDataBlock(records2.subList(0, numRecordsInLog2), header2); + HoodieDataBlock dataBlock2 = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2.subList(0, numRecordsInLog2), header2); writer2.appendBlock(dataBlock2); // Get the size of the block writer2.close(); @@ -1574,7 +1605,7 @@ public void testBasicAppendAndReadInReverse(boolean readBlocksLazily) Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records1, header); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header); writer.appendBlock(dataBlock); writer.close(); @@ -1584,7 +1615,7 @@ public void testBasicAppendAndReadInReverse(boolean readBlocksLazily) List records2 = SchemaTestUtil.generateTestRecords(0, 100); List copyOfRecords2 = records2.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, 
schema)).collect(Collectors.toList()); - dataBlock = getDataBlock(records2, header); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2, header); writer.appendBlock(dataBlock); writer.close(); @@ -1595,7 +1626,7 @@ public void testBasicAppendAndReadInReverse(boolean readBlocksLazily) List records3 = SchemaTestUtil.generateTestRecords(0, 100); List copyOfRecords3 = records3.stream() .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); - dataBlock = getDataBlock(records3, header); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records3, header); writer.appendBlock(dataBlock); writer.close(); @@ -1609,25 +1640,28 @@ public void testBasicAppendAndReadInReverse(boolean readBlocksLazily) HoodieLogBlock prevBlock = reader.prev(); HoodieDataBlock dataBlockRead = (HoodieDataBlock) prevBlock; - assertEquals(copyOfRecords3.size(), dataBlockRead.getRecords().size(), + List recordsRead1 = getRecords(dataBlockRead); + assertEquals(copyOfRecords3.size(), recordsRead1.size(), "Third records size should be equal to the written records size"); - assertEquals(copyOfRecords3, dataBlockRead.getRecords(), + assertEquals(copyOfRecords3, recordsRead1, "Both records lists should be the same. (ordering guaranteed)"); assertTrue(reader.hasPrev(), "Second block should be available"); prevBlock = reader.prev(); dataBlockRead = (HoodieDataBlock) prevBlock; - assertEquals(copyOfRecords2.size(), dataBlockRead.getRecords().size(), + List recordsRead2 = getRecords(dataBlockRead); + assertEquals(copyOfRecords2.size(), recordsRead2.size(), "Read records size should be equal to the written records size"); - assertEquals(copyOfRecords2, dataBlockRead.getRecords(), + assertEquals(copyOfRecords2, recordsRead2, "Both records lists should be the same. (ordering guaranteed)"); assertTrue(reader.hasPrev(), "First block should be available"); prevBlock = reader.prev(); dataBlockRead = (HoodieDataBlock) prevBlock; - assertEquals(copyOfRecords1.size(), dataBlockRead.getRecords().size(), + List recordsRead3 = getRecords(dataBlockRead); + assertEquals(copyOfRecords1.size(), recordsRead3.size(), "Read records size should be equal to the written records size"); - assertEquals(copyOfRecords1, dataBlockRead.getRecords(), + assertEquals(copyOfRecords1, recordsRead3, "Both records lists should be the same. 
(ordering guaranteed)"); assertFalse(reader.hasPrev()); @@ -1646,7 +1680,7 @@ public void testAppendAndReadOnCorruptedLogInReverse(boolean readBlocksLazily) Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records, header); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header); writer.appendBlock(dataBlock); writer.close(); @@ -1674,7 +1708,7 @@ public void testAppendAndReadOnCorruptedLogInReverse(boolean readBlocksLazily) HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); records = SchemaTestUtil.generateTestRecords(0, 100); - dataBlock = getDataBlock(records, header); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header); writer.appendBlock(dataBlock); writer.close(); @@ -1708,7 +1742,7 @@ public void testBasicAppendAndTraverseInReverse(boolean readBlocksLazily) Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieDataBlock dataBlock = getDataBlock(records1, header); + HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header); writer.appendBlock(dataBlock); writer.close(); @@ -1716,7 +1750,7 @@ public void testBasicAppendAndTraverseInReverse(boolean readBlocksLazily) HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); List records2 = SchemaTestUtil.generateTestRecords(0, 100); - dataBlock = getDataBlock(records2, header); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2, header); writer.appendBlock(dataBlock); writer.close(); @@ -1725,7 +1759,7 @@ public void testBasicAppendAndTraverseInReverse(boolean readBlocksLazily) HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION) .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build(); List records3 = SchemaTestUtil.generateTestRecords(0, 100); - dataBlock = getDataBlock(records3, header); + dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records3, header); writer.appendBlock(dataBlock); writer.close(); @@ -1745,9 +1779,10 @@ public void testBasicAppendAndTraverseInReverse(boolean readBlocksLazily) assertTrue(reader.hasPrev(), "First block should be available"); HoodieLogBlock prevBlock = reader.prev(); HoodieDataBlock dataBlockRead = (HoodieDataBlock) prevBlock; - assertEquals(copyOfRecords1.size(), dataBlockRead.getRecords().size(), + List recordsRead = getRecords(dataBlockRead); + assertEquals(copyOfRecords1.size(), recordsRead.size(), "Read records size should be equal to the written records size"); - assertEquals(copyOfRecords1, dataBlockRead.getRecords(), + assertEquals(copyOfRecords1, recordsRead, "Both records lists should be the same. 
(ordering guaranteed)"); assertFalse(reader.hasPrev()); @@ -1770,7 +1805,7 @@ public void testV0Format() throws IOException, URISyntaxException { HoodieLogBlock logBlock = HoodieAvroDataBlock.getBlock(content, schema); assertEquals(HoodieLogBlockType.AVRO_DATA_BLOCK, logBlock.getBlockType()); - List readRecords = ((HoodieAvroDataBlock) logBlock).getRecords(); + List readRecords = getRecords((HoodieAvroDataBlock) logBlock); assertEquals(readRecords.size(), recordsCopy.size()); for (int i = 0; i < recordsCopy.size(); ++i) { assertEquals(recordsCopy.get(i), readRecords.get(i)); @@ -1779,15 +1814,74 @@ public void testV0Format() throws IOException, URISyntaxException { // Reader schema is optional if it is same as write schema logBlock = HoodieAvroDataBlock.getBlock(content, null); assertEquals(HoodieLogBlockType.AVRO_DATA_BLOCK, logBlock.getBlockType()); - readRecords = ((HoodieAvroDataBlock) logBlock).getRecords(); + readRecords = getRecords((HoodieAvroDataBlock) logBlock); assertEquals(readRecords.size(), recordsCopy.size()); for (int i = 0; i < recordsCopy.size(); ++i) { assertEquals(recordsCopy.get(i), readRecords.get(i)); } } - private HoodieDataBlock getDataBlock(List records, Map header) { - return getDataBlock(dataBlockType, records, header); + @ParameterizedTest + @EnumSource(names = {"AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK", "PARQUET_DATA_BLOCK"}) + public void testDataBlockFormatAppendAndReadWithProjectedSchema( + HoodieLogBlockType dataBlockType + ) throws IOException, URISyntaxException, InterruptedException { + Writer writer = HoodieLogFormat.newWriterBuilder() + .onParentPath(partitionPath) + .withFileExtension(HoodieLogFile.DELTA_EXTENSION) + .withFileId("test-fileid1") + .overBaseCommit("100") + .withFs(fs) + .build(); + + List records = SchemaTestUtil.generateTestGenericRecords(0, 1000); + + Schema schema = getSimpleSchema(); + + Map header = + new HashMap() {{ + put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); + put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); + }}; + + // Init Benchmark to report number of bytes actually read from the Block + BenchmarkCounter.initCounterFromReporter(HadoopMapRedUtils.createTestReporter(), fs.getConf()); + + // NOTE: Have to use this ugly hack since List generic is not covariant in its type param + HoodieDataBlock dataBlock = getDataBlock(dataBlockType, (List)(List) records, header); + + writer.appendBlock(dataBlock); + writer.close(); + + Schema projectedSchema = HoodieAvroUtils.generateProjectionSchema(schema, Collections.singletonList("name")); + + List projectedRecords = HoodieAvroUtils.rewriteRecords(records, projectedSchema); + + try (Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), projectedSchema, true, false)) { + assertTrue(reader.hasNext(), "First block should be available"); + + HoodieLogBlock nextBlock = reader.next(); + + HoodieDataBlock dataBlockRead = (HoodieDataBlock) nextBlock; + + Map expectedReadBytes = + new HashMap() {{ + put(HoodieLogBlockType.AVRO_DATA_BLOCK, 0); // not supported + put(HoodieLogBlockType.HFILE_DATA_BLOCK, 0); // not supported + put(HoodieLogBlockType.PARQUET_DATA_BLOCK, 2605); + }}; + + List recordsRead = getRecords(dataBlockRead); + assertEquals(projectedRecords.size(), recordsRead.size(), + "Read records size should be equal to the written records size"); + assertEquals(projectedRecords, recordsRead, + "Both records lists should be the same. 
(ordering guaranteed)"); + assertEquals(dataBlockRead.getSchema(), projectedSchema); + + int bytesRead = (int) BenchmarkCounter.getBytesRead(); + + assertEquals(expectedReadBytes.get(dataBlockType), bytesRead, "Read bytes have to match"); + } } private HoodieDataBlock getDataBlock(HoodieLogBlockType dataBlockType, List records, @@ -1796,7 +1890,9 @@ private HoodieDataBlock getDataBlock(HoodieLogBlockType dataBlockType, List testArguments() { arguments(ExternalSpillableMap.DiskMapType.ROCKS_DB, true, true) ); } + + /** + * Utility to convert the given iterator to a List. + */ + private static List getRecords(HoodieDataBlock dataBlock) { + ClosableIterator itr = dataBlock.getRecordItr(); + + List elements = new ArrayList<>(); + itr.forEachRemaining(elements::add); + return elements; + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormatAppendFailure.java b/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormatAppendFailure.java index e313bb4a6ca0f..6c4d69a05b296 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormatAppendFailure.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/functional/TestHoodieLogFormatAppendFailure.java @@ -19,6 +19,7 @@ package org.apache.hudi.common.functional; import org.apache.hudi.common.model.HoodieArchivedLogFile; +import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.table.log.HoodieLogFormat; import org.apache.hudi.common.table.log.HoodieLogFormat.Writer; import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock; @@ -104,7 +105,7 @@ public void testFailedToGetAppendStreamFromHDFSNameNode() Map header = new HashMap<>(2); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); - HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header); + HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD); Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(testPath) .withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION).withFileId("commits.archive") diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieRecord.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieRecord.java index e31286d10c2cf..b6bbc34cc3de9 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieRecord.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestHoodieRecord.java @@ -44,7 +44,7 @@ public class TestHoodieRecord { public void setUp() throws Exception { final List indexedRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1); final List hoodieRecords = - indexedRecords.stream().map(r -> new HoodieRecord(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), + indexedRecords.stream().map(r -> new HoodieAvroRecord(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), new AvroBinaryTestPayload(Option.of((GenericRecord) r)))).collect(Collectors.toList()); hoodieRecord = hoodieRecords.get(0); } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/properties/TestTypedProperties.java b/hudi-common/src/test/java/org/apache/hudi/common/properties/TestTypedProperties.java index 95955d4d72a27..a3ba13ec14a2a 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/properties/TestTypedProperties.java +++ 
b/hudi-common/src/test/java/org/apache/hudi/common/properties/TestTypedProperties.java @@ -19,8 +19,10 @@ package org.apache.hudi.common.properties; import org.apache.hudi.common.config.TypedProperties; + import org.junit.jupiter.api.Test; +import java.io.IOException; import java.util.Properties; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -81,4 +83,54 @@ public void testGetBoolean() { assertEquals(true, typedProperties.getBoolean("key1", false)); assertEquals(false, typedProperties.getBoolean("key2", false)); } + + @Test + public void testPropertiesOrder() throws IOException { + Properties properties = new TypedProperties(); + properties.put("key0", "true"); + properties.put("key1", "false"); + properties.put("key2", "true"); + properties.put("key3", "false"); + properties.put("key4", "true"); + properties.put("key5", "true"); + properties.put("key6", "false"); + properties.put("key7", "true"); + properties.put("key8", "false"); + properties.put("key9", "true"); + + TypedProperties typedProperties = new TypedProperties(properties); + assertTypeProperties(typedProperties, 0); + } + + @Test + void testPutAllProperties() { + Properties firstProp = new TypedProperties(); + firstProp.put("key0", "true"); + firstProp.put("key1", "false"); + firstProp.put("key2", "true"); + + TypedProperties firstProperties = new TypedProperties(firstProp); + assertTypeProperties(firstProperties, 0); + + TypedProperties secondProperties = new TypedProperties(); + secondProperties.put("key3", "true"); + secondProperties.put("key4", "false"); + secondProperties.put("key5", "true"); + assertTypeProperties(secondProperties, 3); + + TypedProperties thirdProperties = new TypedProperties(); + thirdProperties.putAll(firstProp); + thirdProperties.putAll(secondProperties); + + assertEquals(3, firstProp.stringPropertyNames().size()); + assertEquals(3, secondProperties.stringPropertyNames().size()); + assertEquals(6, thirdProperties.stringPropertyNames().size()); + } + + private void assertTypeProperties(TypedProperties typedProperties, int start) { + String[] props = typedProperties.stringPropertyNames().toArray(new String[0]); + for (int i = start; i < props.length; i++) { + assertEquals(String.format("key%d", i), props[i]); + } + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableConfig.java b/hudi-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableConfig.java index 73d101cf2c71f..f21d8e6dc37e5 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableConfig.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableConfig.java @@ -64,7 +64,7 @@ public void setUp() throws Exception { public void testCreate() throws IOException { assertTrue(fs.exists(new Path(metaPath, HoodieTableConfig.HOODIE_PROPERTIES_FILE))); HoodieTableConfig config = new HoodieTableConfig(fs, metaPath.toString(), null); - assertEquals(4, config.getProps().size()); + assertEquals(5, config.getProps().size()); } @Test @@ -77,7 +77,7 @@ public void testUpdate() throws IOException { assertTrue(fs.exists(cfgPath)); assertFalse(fs.exists(backupCfgPath)); HoodieTableConfig config = new HoodieTableConfig(fs, metaPath.toString(), null); - assertEquals(5, config.getProps().size()); + assertEquals(6, config.getProps().size()); assertEquals("test-table2", config.getTableName()); assertEquals("new_field", config.getPreCombineField()); } @@ -90,7 +90,7 @@ public void testDelete() throws IOException { assertTrue(fs.exists(cfgPath)); 
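// (Note: the expected property counts in these assertions are each one higher than before, presumably because hoodie.properties now also carries the HoodieTableConfig.TABLE_CHECKSUM entry that TestHoodieTableMetaClient.checkMetadata validates below.)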
assertFalse(fs.exists(backupCfgPath)); HoodieTableConfig config = new HoodieTableConfig(fs, metaPath.toString(), null); - assertEquals(3, config.getProps().size()); + assertEquals(4, config.getProps().size()); assertNull(config.getProps().getProperty("hoodie.invalid.config")); assertFalse(config.getProps().contains(HoodieTableConfig.ARCHIVELOG_FOLDER.key())); } @@ -114,7 +114,7 @@ public void testReadsWithUpdateFailures() throws IOException { assertFalse(fs.exists(cfgPath)); assertTrue(fs.exists(backupCfgPath)); config = new HoodieTableConfig(fs, metaPath.toString(), null); - assertEquals(4, config.getProps().size()); + assertEquals(5, config.getProps().size()); } @ParameterizedTest @@ -132,6 +132,6 @@ public void testUpdateRecovery(boolean shouldPropsFileExist) throws IOException assertTrue(fs.exists(cfgPath)); assertFalse(fs.exists(backupCfgPath)); config = new HoodieTableConfig(fs, metaPath.toString(), null); - assertEquals(4, config.getProps().size()); + assertEquals(5, config.getProps().size()); } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableMetaClient.java b/hudi-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableMetaClient.java index 586a451065823..840e6ddf4ad3f 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableMetaClient.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/TestHoodieTableMetaClient.java @@ -54,6 +54,8 @@ public void checkMetadata() { assertEquals(basePath, metaClient.getBasePath(), "Basepath should be the one assigned"); assertEquals(basePath + "/.hoodie", metaClient.getMetaPath(), "Metapath should be ${basepath}/.hoodie"); + assertTrue(metaClient.getTableConfig().getProps().containsKey(HoodieTableConfig.TABLE_CHECKSUM.key())); + assertTrue(HoodieTableConfig.validateChecksum(metaClient.getTableConfig().getProps())); } @Test diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/TestTimelineUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/table/TestTimelineUtils.java index 9397295013ea1..22ceb5bfef373 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/TestTimelineUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/TestTimelineUtils.java @@ -270,8 +270,7 @@ private HoodieRollbackMetadata getRollbackMetadataInstance(String basePath, Stri List rollbacks = new ArrayList<>(); rollbacks.add(new HoodieInstant(false, actionType, commitTs)); - HoodieRollbackStat rollbackStat = new HoodieRollbackStat(partition, deletedFiles, Collections.emptyList(), Collections.emptyMap(), - Collections.EMPTY_MAP); + HoodieRollbackStat rollbackStat = new HoodieRollbackStat(partition, deletedFiles, Collections.emptyList(), Collections.emptyMap()); List rollbackStats = new ArrayList<>(); rollbackStats.add(rollbackStat); return TimelineMetadataUtils.convertRollbackMetadata(commitTs, Option.empty(), rollbacks, rollbackStats); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/timeline/TestHoodieActiveTimeline.java b/hudi-common/src/test/java/org/apache/hudi/common/table/timeline/TestHoodieActiveTimeline.java index 9d89c2a6b5feb..576cfd7cb0f3f 100755 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/timeline/TestHoodieActiveTimeline.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/timeline/TestHoodieActiveTimeline.java @@ -126,6 +126,7 @@ public void testLoadingInstantsFromFiles() throws IOException { HoodieActiveTimeline oldTimeline = new HoodieActiveTimeline( 
HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(metaClient.getBasePath()) .setLoadActiveTimelineOnLoad(true).setConsistencyGuardConfig(metaClient.getConsistencyGuardConfig()) + .setFileSystemRetryConfig(metaClient.getFileSystemRetryConfig()) .setLayoutVersion(Option.of(new TimelineLayoutVersion(VERSION_0))).build()); // Old Timeline writes both to aux and timeline folder oldTimeline.saveToCompactionRequested(instant6, Option.of(dummy)); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestHoodieTableFileSystemView.java b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestHoodieTableFileSystemView.java index 924c6724e7b22..54bc138fc8f84 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestHoodieTableFileSystemView.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestHoodieTableFileSystemView.java @@ -31,6 +31,7 @@ import org.apache.hudi.common.bootstrap.index.BootstrapIndex.IndexWriter; import org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.fs.HoodieWrapperFileSystem; import org.apache.hudi.common.model.BaseFile; import org.apache.hudi.common.model.BootstrapFileMapping; import org.apache.hudi.common.model.CompactionOperation; @@ -41,6 +42,7 @@ import org.apache.hudi.common.model.HoodieFileGroupId; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; @@ -50,12 +52,15 @@ import org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView; import org.apache.hudi.common.table.view.TableFileSystemView.SliceView; import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.common.testutils.HoodieTestTable; import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.common.util.ClusteringUtils; import org.apache.hudi.common.util.CommitUtils; import org.apache.hudi.common.util.CompactionUtils; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ImmutablePair; import org.apache.hudi.common.util.collection.Pair; + import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.junit.jupiter.api.BeforeEach; @@ -1537,6 +1542,234 @@ public void testPendingClusteringOperations() throws IOException { assertFalse(fileIds.contains(fileId3)); } + /** + * + * create hoodie table like + * . + * ├── .hoodie + * │   ├── .aux + * │   │   └── .bootstrap + * │   │   ├── .fileids + * │   │   └── .partitions + * │   ├── .temp + * │   ├── 1.commit + * │   ├── 1.commit.requested + * │   ├── 1.inflight + * │   ├── 2.replacecommit + * │   ├── 2.replacecommit.inflight + * │   ├── 2.replacecommit.requested + * │   ├── 3.commit + * │   ├── 3.commit.requested + * │   ├── 3.inflight + * │   ├── archived + * │   └── hoodie.properties + * └── 2020 + * └── 06 + * └── 27 + * ├── 5fe477d2-0150-46d4-833c-1e9cc8da9948_1-0-1_3.parquet + * ├── 7e3208c8-fdec-4254-9682-8fff1e51ee8d_1-0-1_2.parquet + * ├── e04b0e2d-1467-46b2-8ea6-f4fe950965a5_1-0-1_1.parquet + * └── f3936b66-b3db-4fc8-a6d0-b1a7559016e6_1-0-1_1.parquet + * + * First test fsView API with finished clustering: + * 1. getLatestBaseFilesBeforeOrOn + * 2. getBaseFileOn + * 3. 
getLatestBaseFilesInRange + * 4. getAllBaseFiles + * 5. getLatestBaseFiles + * + * Then remove 2.replacecommit, 1.commit, 1.commit.requested, 1.inflight to simulate + * pending clustering at the earliest position in the active timeline and test these APIs again. + * + * @throws IOException + */ + @Test + public void testHoodieTableFileSystemViewWithPendingClustering() throws IOException { + List latestBaseFilesBeforeOrOn; + Option baseFileOn; + List latestBaseFilesInRange; + List allBaseFiles; + List latestBaseFiles; + List latestBaseFilesPerPartition; + String partitionPath = "2020/06/27"; + new File(basePath + "/" + partitionPath).mkdirs(); + HoodieActiveTimeline commitTimeline = metaClient.getActiveTimeline(); + + // will create 5 fileId in partition. + // fileId1 and fileId2 will be replaced by fileID3 + // fileId4 and fileId5 will be committed after clustering finished. + String fileId1 = UUID.randomUUID().toString(); + String fileId2 = UUID.randomUUID().toString(); + String fileId3 = UUID.randomUUID().toString(); + String fileId4 = UUID.randomUUID().toString(); + String fileId5 = UUID.randomUUID().toString(); + + assertFalse(roView.getLatestBaseFiles(partitionPath) + .anyMatch(dfile -> dfile.getFileId().equals(fileId1) + || dfile.getFileId().equals(fileId2) + || dfile.getFileId().equals(fileId3) + || dfile.getFileId().equals(fileId4) + || dfile.getFileId().equals(fileId5)), + "No commit, should not find any data file"); + + // first insert commit + String commitTime1 = "1"; + String fileName1 = FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId1); + String fileName2 = FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId2); + new File(basePath + "/" + partitionPath + "/" + fileName1).createNewFile(); + new File(basePath + "/" + partitionPath + "/" + fileName2).createNewFile(); + + HoodieInstant instant1 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, commitTime1); + + // build writeStats + HashMap> partitionToFile1 = new HashMap<>(); + ArrayList files1 = new ArrayList<>(); + files1.add(fileId1); + files1.add(fileId2); + partitionToFile1.put(partitionPath, files1); + List writeStats1 = buildWriteStats(partitionToFile1, commitTime1); + + HoodieCommitMetadata commitMetadata1 = + CommitUtils.buildMetadata(writeStats1, new HashMap<>(), Option.empty(), WriteOperationType.INSERT, "", HoodieTimeline.COMMIT_ACTION); + saveAsComplete(commitTimeline, instant1, Option.of(commitMetadata1.toJsonString().getBytes(StandardCharsets.UTF_8))); + commitTimeline.reload(); + + // replace commit + String commitTime2 = "2"; + String fileName3 = FSUtils.makeDataFileName(commitTime2, TEST_WRITE_TOKEN, fileId3); + new File(basePath + "/" + partitionPath + "/" + fileName3).createNewFile(); + + HoodieInstant instant2 = new HoodieInstant(true, HoodieTimeline.REPLACE_COMMIT_ACTION, commitTime2); + Map> partitionToReplaceFileIds = new HashMap<>(); + List replacedFileIds = new ArrayList<>(); + replacedFileIds.add(fileId1); + replacedFileIds.add(fileId2); + partitionToReplaceFileIds.put(partitionPath, replacedFileIds); + + HashMap> partitionToFile2 = new HashMap<>(); + ArrayList files2 = new ArrayList<>(); + files2.add(fileId3); + partitionToFile2.put(partitionPath, files2); + List writeStats2 = buildWriteStats(partitionToFile2, commitTime2); + + HoodieCommitMetadata commitMetadata2 = + CommitUtils.buildMetadata(writeStats2, partitionToReplaceFileIds, Option.empty(), WriteOperationType.INSERT_OVERWRITE, "", HoodieTimeline.REPLACE_COMMIT_ACTION); + saveAsComplete(commitTimeline, instant2, 
+
+    // another insert commit
+    String commitTime3 = "3";
+    String fileName4 = FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId4);
+    new File(basePath + "/" + partitionPath + "/" + fileName4).createNewFile();
+    HoodieInstant instant3 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, commitTime3);
+
+    // build writeStats
+    HashMap<String, List<String>> partitionToFile3 = new HashMap<>();
+    ArrayList<String> files3 = new ArrayList<>();
+    files3.add(fileId4);
+    partitionToFile3.put(partitionPath, files3);
+    List<HoodieWriteStat> writeStats3 = buildWriteStats(partitionToFile3, commitTime3);
+    HoodieCommitMetadata commitMetadata3 =
+        CommitUtils.buildMetadata(writeStats3, new HashMap<>(), Option.empty(), WriteOperationType.INSERT, "", HoodieTimeline.COMMIT_ACTION);
+    saveAsComplete(commitTimeline, instant3, Option.of(commitMetadata3.toJsonString().getBytes(StandardCharsets.UTF_8)));
+
+    metaClient.reloadActiveTimeline();
+    refreshFsView();
+
+    ArrayList<String> commits = new ArrayList<>();
+    commits.add(commitTime1);
+    commits.add(commitTime2);
+    commits.add(commitTime3);
+
+    // do check
+    latestBaseFilesBeforeOrOn = fsView.getLatestBaseFilesBeforeOrOn(partitionPath, commitTime3).map(HoodieBaseFile::getFileId).collect(Collectors.toList());
+    assertEquals(2, latestBaseFilesBeforeOrOn.size());
+    assertTrue(latestBaseFilesBeforeOrOn.contains(fileId3));
+    assertTrue(latestBaseFilesBeforeOrOn.contains(fileId4));
+
+    // could see fileId3 because clustering is committed.
+    baseFileOn = fsView.getBaseFileOn(partitionPath, commitTime2, fileId3);
+    assertTrue(baseFileOn.isPresent());
+    assertEquals(baseFileOn.get().getFileId(), fileId3);
+
+    latestBaseFilesInRange = fsView.getLatestBaseFilesInRange(commits).map(HoodieBaseFile::getFileId).collect(Collectors.toList());
+    assertEquals(2, latestBaseFilesInRange.size());
+    assertTrue(latestBaseFilesInRange.contains(fileId3));
+    assertTrue(latestBaseFilesInRange.contains(fileId4));
+
+    allBaseFiles = fsView.getAllBaseFiles(partitionPath).map(HoodieBaseFile::getFileId).collect(Collectors.toList());
+    assertEquals(2, allBaseFiles.size());
+    assertTrue(allBaseFiles.contains(fileId3));
+    assertTrue(allBaseFiles.contains(fileId4));
+
+    // could see fileId3 because clustering is committed.
+    latestBaseFiles = fsView.getLatestBaseFiles().map(HoodieBaseFile::getFileId).collect(Collectors.toList());
+    assertEquals(2, latestBaseFiles.size());
+    assertTrue(latestBaseFiles.contains(fileId3));
+    assertTrue(latestBaseFiles.contains(fileId4));
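A quick trace of the timeline state these assertions encode (editor's summary; note that the visible code never actually writes fileId5, despite the comment at the top of the test):

```
"1".commit         -> writes fileId1, fileId2
"2".replacecommit  -> writes fileId3, replaces {fileId1, fileId2}   (clustering committed)
"3".commit         -> writes fileId4
latest file-system view => {fileId3, fileId4}
```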
+    // could see fileId3 because clustering is committed.
+    latestBaseFilesPerPartition = fsView.getLatestBaseFiles(partitionPath).map(HoodieBaseFile::getFileId).collect(Collectors.toList());
+    assertEquals(2, latestBaseFilesPerPartition.size());
+    assertTrue(latestBaseFilesPerPartition.contains(fileId3));
+    assertTrue(latestBaseFilesPerPartition.contains(fileId4));
+
+    HoodieWrapperFileSystem fs = metaClient.getFs();
+    fs.delete(new Path(basePath + "/.hoodie", "1.commit"), false);
+    fs.delete(new Path(basePath + "/.hoodie", "1.inflight"), false);
+    fs.delete(new Path(basePath + "/.hoodie", "1.commit.requested"), false);
+    fs.delete(new Path(basePath + "/.hoodie", "2.replacecommit"), false);
+
+    metaClient.reloadActiveTimeline();
+    refreshFsView();
+    // do check after deleting some commit files
+    latestBaseFilesBeforeOrOn = fsView.getLatestBaseFilesBeforeOrOn(partitionPath, commitTime3).map(HoodieBaseFile::getFileId).collect(Collectors.toList());
+    assertEquals(3, latestBaseFilesBeforeOrOn.size());
+    assertTrue(latestBaseFilesBeforeOrOn.contains(fileId1));
+    assertTrue(latestBaseFilesBeforeOrOn.contains(fileId2));
+    assertTrue(latestBaseFilesBeforeOrOn.contains(fileId4));
+
+    // couldn't see fileId3 because clustering is not committed.
+    baseFileOn = fsView.getBaseFileOn(partitionPath, commitTime2, fileId3);
+    assertFalse(baseFileOn.isPresent());
+
+    latestBaseFilesInRange = fsView.getLatestBaseFilesInRange(commits).map(HoodieBaseFile::getFileId).collect(Collectors.toList());
+    assertEquals(3, latestBaseFilesInRange.size());
+    assertTrue(latestBaseFilesInRange.contains(fileId1));
+    assertTrue(latestBaseFilesInRange.contains(fileId2));
+    assertTrue(latestBaseFilesInRange.contains(fileId4));
+
+    allBaseFiles = fsView.getAllBaseFiles(partitionPath).map(HoodieBaseFile::getFileId).collect(Collectors.toList());
+    assertEquals(3, allBaseFiles.size());
+    assertTrue(allBaseFiles.contains(fileId1));
+    assertTrue(allBaseFiles.contains(fileId2));
+    assertTrue(allBaseFiles.contains(fileId4));
+
+    // couldn't see fileId3 because clustering is not committed.
+    latestBaseFiles = fsView.getLatestBaseFiles().map(HoodieBaseFile::getFileId).collect(Collectors.toList());
+    assertEquals(3, latestBaseFiles.size());
+    assertTrue(latestBaseFiles.contains(fileId1));
+    assertTrue(latestBaseFiles.contains(fileId2));
+    assertTrue(latestBaseFiles.contains(fileId4));
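How the fs.delete calls above simulate pending clustering (editor's note): removing all three meta files of instant 1 erases that commit from the active timeline entirely, while removing only the completed 2.replacecommit leaves its requested/inflight files behind, so the clustering becomes a pending operation at the head of the timeline. The view then hides fileId3 (written by the now-pending clustering) and surfaces fileId1/fileId2 again; the base files of the erased commit 1 stay visible, presumably because their instant time now falls before the start of the active timeline. In `.hoodie` terms:

```
before                          after the four deletes
  1.commit.requested              (gone)
  1.inflight                      (gone)
  1.commit                        (gone)
  2.replacecommit.requested       2.replacecommit.requested
  2.replacecommit.inflight        2.replacecommit.inflight
  2.replacecommit                 (gone -> clustering pending)
  3.* (commit files)              3.* (commit files)
```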
+    // couldn't see fileId3 because clustering is not committed.
+    latestBaseFilesPerPartition = fsView.getLatestBaseFiles(partitionPath).map(HoodieBaseFile::getFileId).collect(Collectors.toList());
+    assertEquals(3, latestBaseFilesPerPartition.size());
+    assertTrue(latestBaseFilesPerPartition.contains(fileId1));
+    assertTrue(latestBaseFilesPerPartition.contains(fileId2));
+    assertTrue(latestBaseFilesPerPartition.contains(fileId4));
+  }
+
+  // Generates a HoodieWriteStat for every file written in the given partitions.
+  private List<HoodieWriteStat> buildWriteStats(HashMap<String, List<String>> partitionToFileIds, String commitTime) {
+    HashMap<String, List<Pair<String, Integer>>> maps = new HashMap<>();
+    for (String partition : partitionToFileIds.keySet()) {
+      List<Pair<String, Integer>> list = partitionToFileIds.get(partition).stream().map(fileId -> new ImmutablePair<>(fileId, 0)).collect(Collectors.toList());
+      maps.put(partition, list);
+    }
+    return HoodieTestTable.generateHoodieWriteStatForPartition(maps, commitTime, false);
+  }
+
   @Override
   protected HoodieTableType getTableType() {
     return HoodieTableType.MERGE_ON_READ;
diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestIncrementalFSViewSync.java b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestIncrementalFSViewSync.java
index 0bcebaf71e9ff..a9c9db303f328 100644
--- a/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestIncrementalFSViewSync.java
+++ b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestIncrementalFSViewSync.java
@@ -556,7 +556,7 @@ private void performRestore(HoodieInstant instant, List files, String ro
                               boolean isRestore) throws IOException {
     Map<String, List<String>> partititonToFiles = deleteFiles(files);
     List<HoodieRollbackStat> rollbackStats = partititonToFiles.entrySet().stream().map(e ->
-        new HoodieRollbackStat(e.getKey(), e.getValue(), new ArrayList<>(), new HashMap<>(), new HashMap<>())
+        new HoodieRollbackStat(e.getKey(), e.getValue(), new ArrayList<>(), new HashMap<>())
     ).collect(Collectors.toList());
 
     List rollbacks = new ArrayList<>();
diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/FixtureUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/FixtureUtils.java
deleted file mode 100644
index 6dfe0da797f8e..0000000000000
--- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/FixtureUtils.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */ - -package org.apache.hudi.common.testutils; - -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.net.URL; -import java.nio.file.Path; -import java.util.Objects; -import java.util.zip.ZipEntry; -import java.util.zip.ZipInputStream; - -public final class FixtureUtils { - - public static Path prepareFixtureTable(URL fixtureResource, Path basePath) throws IOException { - File zippedFixtureTable = new File(fixtureResource.getFile()); - try (ZipInputStream zis = new ZipInputStream(new FileInputStream(zippedFixtureTable))) { - byte[] buffer = new byte[1024]; - ZipEntry zipEntry = zis.getNextEntry(); - Path tableBasePath = basePath.resolve(Objects.requireNonNull(zipEntry).getName() - .replaceAll(File.separator + "$", "")); - while (zipEntry != null) { - File newFile = newFile(basePath.toFile(), zipEntry); - if (zipEntry.isDirectory()) { - if (!newFile.isDirectory() && !newFile.mkdirs()) { - throw new IOException("Failed to create directory " + newFile); - } - } else { - // fix for Windows-created archives - File parent = newFile.getParentFile(); - if (!parent.isDirectory() && !parent.mkdirs()) { - throw new IOException("Failed to create directory " + parent); - } - - // write file content - try (FileOutputStream fos = new FileOutputStream(newFile)) { - int len; - while ((len = zis.read(buffer)) > 0) { - fos.write(buffer, 0, len); - } - } - } - zipEntry = zis.getNextEntry(); - } - zis.closeEntry(); - return tableBasePath; - } - } - - public static File newFile(File destinationDir, ZipEntry zipEntry) throws IOException { - File destFile = new File(destinationDir, zipEntry.getName()); - - String destDirPath = destinationDir.getCanonicalPath(); - String destFilePath = destFile.getCanonicalPath(); - - if (!destFilePath.startsWith(destDirPath + File.separator)) { - throw new IOException("Entry is outside of the target dir: " + zipEntry.getName()); - } - - return destFile; - } -} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HadoopMapRedUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HadoopMapRedUtils.java new file mode 100644 index 0000000000000..a06039b5fba35 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HadoopMapRedUtils.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.hudi.common.testutils;
+
+import org.apache.hadoop.mapred.Counters;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hudi.common.util.Option;
+
+import java.util.concurrent.ConcurrentHashMap;
+
+public class HadoopMapRedUtils {
+
+  /**
+   * Creates an instance of {@link Reporter} to collect reported counters.
+   */
+  public static Reporter createTestReporter() {
+    class TestReporter implements Reporter {
+      private final ConcurrentHashMap<String, Counters.Counter> counters =
+          new ConcurrentHashMap<>();
+
+      @Override
+      public void setStatus(String status) {
+        // not-supported
+      }
+
+      @Override
+      public Counters.Counter getCounter(Enum<?> name) {
+        return counters.computeIfAbsent(name.name(), (ignored) -> new Counters.Counter());
+      }
+
+      @Override
+      public Counters.Counter getCounter(String group, String name) {
+        return counters.computeIfAbsent(getKey(group, name), (ignored) -> new Counters.Counter());
+      }
+
+      @Override
+      public void incrCounter(Enum<?> key, long amount) {
+        Option.ofNullable(counters.get(key.name()))
+            .ifPresent(c -> c.increment(amount));
+      }
+
+      @Override
+      public void incrCounter(String group, String counter, long amount) {
+        Option.ofNullable(counters.get(getKey(group, counter)))
+            .ifPresent(c -> c.increment(amount));
+      }
+
+      @Override
+      public InputSplit getInputSplit() throws UnsupportedOperationException {
+        throw new UnsupportedOperationException("not supported");
+      }
+
+      @Override
+      public float getProgress() {
+        return -1;
+      }
+
+      @Override
+      public void progress() {
+        // not-supported
+      }
+
+      private String getKey(String group, String name) {
+        return String.format("%s:%s", group, name);
+      }
+    }
+
+    return new TestReporter();
+  }
+}
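For orientation (editor's note, not part of the patch): the reporter only creates counters through the two getCounter overloads, while the incrCounter overloads silently drop updates for counters that were never fetched. A hypothetical usage from a test would look like this:

```java
import org.apache.hadoop.mapred.Reporter;

Reporter reporter = HadoopMapRedUtils.createTestReporter();

// incrCounter() on a counter that was never created is a silent no-op:
reporter.incrCounter("parquet", "bytes.read", 512);

// getCounter() lazily creates the counter, after which increments stick:
reporter.getCounter("parquet", "bytes.read").increment(1024);
long bytesRead = reporter.getCounter("parquet", "bytes.read").getValue(); // 1024
```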
diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java
index 21816a56c2db2..3e147b7fdd47c 100644
--- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java
+++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java
@@ -19,10 +19,22 @@
 package org.apache.hudi.common.testutils;
 
+import org.apache.avro.Conversions;
+import org.apache.avro.LogicalTypes;
+import org.apache.avro.Schema;
+import org.apache.avro.generic.GenericArray;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericFixed;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
 import org.apache.hudi.avro.HoodieAvroUtils;
 import org.apache.hudi.avro.model.HoodieCompactionPlan;
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.HoodieAvroPayload;
+import org.apache.hudi.common.model.HoodieAvroRecord;
 import org.apache.hudi.common.model.HoodieCommitMetadata;
 import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodiePartitionMetadata;
@@ -33,29 +45,22 @@
 import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
 import org.apache.hudi.common.util.AvroOrcUtils;
 import org.apache.hudi.common.util.Option;
+import org.apache.hudi.exception.HoodieException;
 import org.apache.hudi.exception.HoodieIOException;
-
-import org.apache.avro.Conversions;
-import org.apache.avro.LogicalTypes;
-import org.apache.avro.Schema;
-import org.apache.avro.generic.GenericArray;
-import org.apache.avro.generic.GenericData;
-import org.apache.avro.generic.GenericFixed;
-import org.apache.avro.generic.GenericRecord;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataOutputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 import org.apache.orc.TypeDescription;
 
 import java.io.IOException;
 import java.io.Serializable;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
 import java.math.BigDecimal;
 import java.nio.ByteBuffer;
 import java.nio.charset.StandardCharsets;
-import java.sql.Date;
+import java.time.Instant;
+import java.time.LocalDateTime;
+import java.time.ZoneOffset;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
@@ -139,7 +144,7 @@ public class HoodieTestDataGenerator implements AutoCloseable {
   public static final TypeDescription ORC_TRIP_SCHEMA = AvroOrcUtils.createOrcSchema(new Schema.Parser().parse(TRIP_SCHEMA));
   public static final Schema FLATTENED_AVRO_SCHEMA = new Schema.Parser().parse(TRIP_FLATTENED_SCHEMA);
 
-  private static final Random RAND = new Random(46474747);
+  private final Random rand;
 
   //Maintains all the existing keys schema wise
   private final Map<String, Map<Integer, KeyPartition>> existingKeysBySchema;
@@ -147,27 +152,58 @@
   //maintains the count of existing keys schema wise
   private Map<String, Integer> numKeysBySchema;
 
+  public HoodieTestDataGenerator(long seed) {
+    this(seed, DEFAULT_PARTITION_PATHS, new HashMap<>());
+  }
+
+  public HoodieTestDataGenerator(long seed, String[] partitionPaths, Map<Integer, KeyPartition> keyPartitionMap) {
+    this.rand = new Random(seed);
+    this.partitionPaths = Arrays.copyOf(partitionPaths, partitionPaths.length);
+    this.existingKeysBySchema = new HashMap<>();
+    this.existingKeysBySchema.put(TRIP_EXAMPLE_SCHEMA, keyPartitionMap);
+    this.numKeysBySchema = new HashMap<>();
+    this.numKeysBySchema.put(TRIP_EXAMPLE_SCHEMA, keyPartitionMap.size());
+
+    logger.info(String.format("Test DataGenerator's seed (%s)", seed));
+  }
+
+  //////////////////////////////////////////////////////////////////////////////////
+  // DEPRECATED API
+  //////////////////////////////////////////////////////////////////////////////////
+
+  @Deprecated
   public HoodieTestDataGenerator(String[] partitionPaths) {
     this(partitionPaths, new HashMap<>());
   }
 
+  @Deprecated
   public HoodieTestDataGenerator() {
     this(DEFAULT_PARTITION_PATHS);
   }
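Editor's note on the new constructors: seeding the generator makes test data reproducible across runs, while the deprecated constructors that follow keep the old "always different" behavior via System.nanoTime(). A hypothetical snippet pinning the seed (the literal seed and counts are arbitrary):

```java
// Two generators built with the same seed produce identical pseudo-random records.
HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(0xDEEDL);
List<HoodieRecord> inserts = dataGen.generateInserts("000", 10);
List<HoodieRecord> updates = dataGen.generateUniqueUpdates("001", 5);
```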
 
+  @Deprecated
   public HoodieTestDataGenerator(String[] partitionPaths, Map<Integer, KeyPartition> keyPartitionMap) {
-    this.partitionPaths = Arrays.copyOf(partitionPaths, partitionPaths.length);
-    this.existingKeysBySchema = new HashMap<>();
-    existingKeysBySchema.put(TRIP_EXAMPLE_SCHEMA, keyPartitionMap);
-    numKeysBySchema = new HashMap<>();
-    numKeysBySchema.put(TRIP_EXAMPLE_SCHEMA, keyPartitionMap.size());
+    // NOTE: This is used as a workaround to make sure that new instantiations of the generator
+    // always return "new" random values.
+    // Caveat is that if 2 successive invocations are made w/in the timespan that is smaller
+    // than the resolution of {@code nanoTime}, then this will produce identical results.
+    this(System.nanoTime(), partitionPaths, keyPartitionMap);
+  }
+
+  /**
+   * @deprecated please use the non-static version
+   */
+  public static void writePartitionMetadataDeprecated(FileSystem fs, String[] partitionPaths, String basePath) {
+    new HoodieTestDataGenerator().writePartitionMetadata(fs, partitionPaths, basePath);
+  }
+
+  //////////////////////////////////////////////////////////////////////////////////
+
   /**
    * @implNote {@link HoodieTestDataGenerator} is supposed to just generate records with schemas. Leave HoodieTable files
    *     (metafile, basefile, logfile, etc) to {@link HoodieTestTable}.
    * @deprecated Use {@link HoodieTestTable#withPartitionMetaFiles(java.lang.String...)} instead.
    */
-  public static void writePartitionMetadata(FileSystem fs, String[] partitionPaths, String basePath) {
+  public void writePartitionMetadata(FileSystem fs, String[] partitionPaths, String basePath) {
     for (String partitionPath : partitionPaths) {
       new HoodiePartitionMetadata(fs, "000", new Path(basePath), new Path(basePath, partitionPath)).trySave(0);
     }
@@ -197,7 +233,7 @@ public RawTripTestPayload generateRandomValueAsPerSchema(String schemaStr, Hoodi
    * @param instantTime Instant time to use.
    * @return Raw payload of a test record.
    */
-  public static RawTripTestPayload generateRandomValue(HoodieKey key, String instantTime) throws IOException {
+  public RawTripTestPayload generateRandomValue(HoodieKey key, String instantTime) throws IOException {
     return generateRandomValue(key, instantTime, false);
   }
 
@@ -211,12 +247,12 @@ public static RawTripTestPayload generateRandomValue(HoodieKey key, String insta
    * @return Raw payload of a test record.
    * @throws IOException
    */
-  public static RawTripTestPayload generateRandomValue(
+  private RawTripTestPayload generateRandomValue(
       HoodieKey key, String instantTime, boolean isFlattened) throws IOException {
     return generateRandomValue(key, instantTime, isFlattened, 0);
   }
 
-  public static RawTripTestPayload generateRandomValue(
+  private RawTripTestPayload generateRandomValue(
       HoodieKey key, String instantTime, boolean isFlattened, int ts) throws IOException {
     GenericRecord rec = generateGenericRecord(
         key.getRecordKey(), key.getPartitionPath(), "rider-" + instantTime, "driver-" + instantTime, ts,
@@ -240,7 +276,7 @@ public RawTripTestPayload generatePayloadForShortTripSchema(HoodieKey key, Strin
   /**
    * Generates a new avro record of the above schema format for a delete.
    */
-  public static RawTripTestPayload generateRandomDeleteValue(HoodieKey key, String instantTime) throws IOException {
+  private RawTripTestPayload generateRandomDeleteValue(HoodieKey key, String instantTime) throws IOException {
     GenericRecord rec = generateGenericRecord(key.getRecordKey(), key.getPartitionPath(), "rider-" + instantTime,
         "driver-" + instantTime, 0, true, false);
     return new RawTripTestPayload(Option.of(rec.toString()), key.getRecordKey(), key.getPartitionPath(),
         TRIP_EXAMPLE_SCHEMA, true, 0L);
@@ -249,17 +285,17 @@ public static RawTripTestPayload generateRandomDeleteValue(HoodieKey key, String
   /**
    * Generates a new avro record of the above schema format, retaining the key if optionally provided.
*/ - public static HoodieAvroPayload generateAvroPayload(HoodieKey key, String instantTime) { + private HoodieAvroPayload generateAvroPayload(HoodieKey key, String instantTime) { GenericRecord rec = generateGenericRecord(key.getRecordKey(), key.getPartitionPath(), "rider-" + instantTime, "driver-" + instantTime, 0); return new HoodieAvroPayload(Option.of(rec)); } - public static GenericRecord generateGenericRecord(String rowKey, String partitionPath, String riderName, String driverName, - long timestamp) { + public GenericRecord generateGenericRecord(String rowKey, String partitionPath, String riderName, String driverName, + long timestamp) { return generateGenericRecord(rowKey, partitionPath, riderName, driverName, timestamp, false, false); } - public static GenericRecord generateGenericRecord(String rowKey, String partitionPath, String riderName, String driverName, + public GenericRecord generateGenericRecord(String rowKey, String partitionPath, String riderName, String driverName, long timestamp, boolean isDeleteRecord, boolean isFlattened) { GenericRecord rec = new GenericData.Record(isFlattened ? FLATTENED_AVRO_SCHEMA : AVRO_SCHEMA); @@ -268,25 +304,25 @@ public static GenericRecord generateGenericRecord(String rowKey, String partitio rec.put("partition_path", partitionPath); rec.put("rider", riderName); rec.put("driver", driverName); - rec.put("begin_lat", RAND.nextDouble()); - rec.put("begin_lon", RAND.nextDouble()); - rec.put("end_lat", RAND.nextDouble()); - rec.put("end_lon", RAND.nextDouble()); + rec.put("begin_lat", rand.nextDouble()); + rec.put("begin_lon", rand.nextDouble()); + rec.put("end_lat", rand.nextDouble()); + rec.put("end_lon", rand.nextDouble()); if (isFlattened) { - rec.put("fare", RAND.nextDouble() * 100); + rec.put("fare", rand.nextDouble() * 100); rec.put("currency", "USD"); } else { - rec.put("distance_in_meters", RAND.nextInt()); - rec.put("seconds_since_epoch", RAND.nextLong()); - rec.put("weight", RAND.nextFloat()); + rec.put("distance_in_meters", rand.nextInt()); + rec.put("seconds_since_epoch", rand.nextLong()); + rec.put("weight", rand.nextFloat()); byte[] bytes = "Canada".getBytes(); rec.put("nation", ByteBuffer.wrap(bytes)); - long currentTimeMillis = System.currentTimeMillis(); - Date date = new Date(currentTimeMillis); - rec.put("current_date", (int) date.toLocalDate().toEpochDay()); - rec.put("current_ts", currentTimeMillis); + long randomMillis = genRandomTimeMillis(rand); + Instant instant = Instant.ofEpochMilli(randomMillis); + rec.put("current_date", (int) LocalDateTime.ofInstant(instant, ZoneOffset.UTC).toLocalDate().toEpochDay()); + rec.put("current_ts", randomMillis); - BigDecimal bigDecimal = new BigDecimal(String.format("%5f", RAND.nextFloat())); + BigDecimal bigDecimal = new BigDecimal(String.format("%5f", rand.nextFloat())); Schema decimalSchema = AVRO_SCHEMA.getField("height").schema(); Conversions.DecimalConversion decimalConversions = new Conversions.DecimalConversion(); GenericFixed genericFixed = decimalConversions.toFixed(bigDecimal, decimalSchema, LogicalTypes.decimal(10, 6)); @@ -295,14 +331,14 @@ public static GenericRecord generateGenericRecord(String rowKey, String partitio rec.put("city_to_state", Collections.singletonMap("LA", "CA")); GenericRecord fareRecord = new GenericData.Record(AVRO_SCHEMA.getField("fare").schema()); - fareRecord.put("amount", RAND.nextDouble() * 100); + fareRecord.put("amount", rand.nextDouble() * 100); fareRecord.put("currency", "USD"); rec.put("fare", fareRecord); GenericArray tipHistoryArray = new 
GenericData.Array<>(1, AVRO_SCHEMA.getField("tip_history").schema()); Schema tipSchema = new Schema.Parser().parse(AVRO_SCHEMA.getField("tip_history").schema().toString()).getElementType(); GenericRecord tipRecord = new GenericData.Record(tipSchema); - tipRecord.put("amount", RAND.nextDouble() * 100); + tipRecord.put("amount", rand.nextDouble() * 100); tipRecord.put("currency", "USD"); tipHistoryArray.add(tipRecord); rec.put("tip_history", tipHistoryArray); @@ -325,7 +361,7 @@ public GenericRecord generateRecordForTripSchema(String rowKey, String riderName rec.put("timestamp", timestamp); rec.put("rider", riderName); rec.put("driver", driverName); - rec.put("fare", RAND.nextDouble() * 100); + rec.put("fare", rand.nextDouble() * 100); rec.put("_hoodie_is_deleted", false); return rec; } @@ -336,7 +372,7 @@ public GenericRecord generateRecordForShortTripSchema(String rowKey, String ride rec.put("timestamp", timestamp); rec.put("rider", riderName); rec.put("driver", driverName); - rec.put("fare", RAND.nextDouble() * 100); + rec.put("fare", rand.nextDouble() * 100); rec.put("_hoodie_is_deleted", false); return rec; } @@ -346,7 +382,7 @@ public static void createCommitFile(String basePath, String instantTime, Configu createCommitFile(basePath, instantTime, configuration, commitMetadata); } - public static void createCommitFile(String basePath, String instantTime, Configuration configuration, HoodieCommitMetadata commitMetadata) { + private static void createCommitFile(String basePath, String instantTime, Configuration configuration, HoodieCommitMetadata commitMetadata) { Arrays.asList(HoodieTimeline.makeCommitFileName(instantTime), HoodieTimeline.makeInflightCommitFileName(instantTime), HoodieTimeline.makeRequestedCommitFileName(instantTime)) .forEach(f -> createMetadataFile(f, basePath, configuration, commitMetadata)); @@ -382,13 +418,7 @@ private static void createMetadataFile(String f, String basePath, Configuration } } - public static void createReplaceFile(String basePath, String instantTime, Configuration configuration, HoodieCommitMetadata commitMetadata) { - Arrays.asList(HoodieTimeline.makeReplaceFileName(instantTime), HoodieTimeline.makeInflightReplaceFileName(instantTime), - HoodieTimeline.makeRequestedReplaceFileName(instantTime)) - .forEach(f -> createMetadataFile(f, basePath, configuration, commitMetadata)); - } - - public static void createPendingReplaceFile(String basePath, String instantTime, Configuration configuration, HoodieCommitMetadata commitMetadata) { + private static void createPendingReplaceFile(String basePath, String instantTime, Configuration configuration, HoodieCommitMetadata commitMetadata) { Arrays.asList(HoodieTimeline.makeInflightReplaceFileName(instantTime), HoodieTimeline.makeRequestedReplaceFileName(instantTime)) .forEach(f -> createMetadataFile(f, basePath, configuration, commitMetadata)); @@ -406,13 +436,6 @@ public static void createEmptyCleanRequestedFile(String basePath, String instant createEmptyFile(basePath, commitFile, configuration); } - public static void createCompactionRequestedFile(String basePath, String instantTime, Configuration configuration) - throws IOException { - Path commitFile = new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" - + HoodieTimeline.makeRequestedCompactionFileName(instantTime)); - createEmptyFile(basePath, commitFile, configuration); - } - private static void createEmptyFile(String basePath, Path filePath, Configuration configuration) throws IOException { FileSystem fs = FSUtils.getFs(basePath, 
configuration); FSDataOutputStream os = fs.create(filePath, true); @@ -483,13 +506,13 @@ public List generateInsertsContainsAllPartitions(String instantTim } public List generateInsertsForPartition(String instantTime, Integer n, String partition) { - return generateInsertsStream(instantTime, n, false, TRIP_EXAMPLE_SCHEMA, false, () -> partition, () -> UUID.randomUUID().toString()).collect(Collectors.toList()); + return generateInsertsStream(instantTime, n, false, TRIP_EXAMPLE_SCHEMA, false, () -> partition, () -> genPseudoRandomUUID(rand).toString()).collect(Collectors.toList()); } public Stream generateInsertsStream(String commitTime, Integer n, boolean isFlattened, String schemaStr, boolean containsAllPartitions) { return generateInsertsStream(commitTime, n, isFlattened, schemaStr, containsAllPartitions, - () -> partitionPaths[RAND.nextInt(partitionPaths.length)], - () -> UUID.randomUUID().toString()); + () -> partitionPaths[rand.nextInt(partitionPaths.length)], + () -> genPseudoRandomUUID(rand).toString()); } /** @@ -510,7 +533,7 @@ public Stream generateInsertsStream(String instantTime, Integer n, populateKeysBySchema(schemaStr, currSize + i, kp); incrementNumExistingKeysBySchema(schemaStr); try { - return new HoodieRecord(key, generateRandomValueAsPerSchema(schemaStr, key, instantTime, isFlattened)); + return new HoodieAvroRecord(key, generateRandomValueAsPerSchema(schemaStr, key, instantTime, isFlattened)); } catch (IOException e) { throw new HoodieIOException(e.getMessage(), e); } @@ -541,7 +564,7 @@ public List generateSameKeyInserts(String instantTime, List copy = new ArrayList<>(); for (HoodieRecord r : origin) { HoodieKey key = r.getKey(); - HoodieRecord record = new HoodieRecord(key, generateRandomValue(key, instantTime)); + HoodieRecord record = new HoodieAvroRecord(key, generateRandomValue(key, instantTime)); copy.add(record); } return copy; @@ -551,9 +574,9 @@ public List generateInsertsWithHoodieAvroPayload(String instantTim List inserts = new ArrayList<>(); int currSize = getNumExistingKeys(TRIP_EXAMPLE_SCHEMA); for (int i = 0; i < limit; i++) { - String partitionPath = partitionPaths[RAND.nextInt(partitionPaths.length)]; - HoodieKey key = new HoodieKey(UUID.randomUUID().toString(), partitionPath); - HoodieRecord record = new HoodieRecord(key, generateAvroPayload(key, instantTime)); + String partitionPath = partitionPaths[rand.nextInt(partitionPaths.length)]; + HoodieKey key = new HoodieKey(genPseudoRandomUUID(rand).toString(), partitionPath); + HoodieRecord record = new HoodieAvroRecord(key, generateAvroPayload(key, instantTime)); inserts.add(record); KeyPartition kp = new KeyPartition(); @@ -568,7 +591,7 @@ public List generateInsertsWithHoodieAvroPayload(String instantTim public List generateUpdatesWithHoodieAvroPayload(String instantTime, List baseRecords) { List updates = new ArrayList<>(); for (HoodieRecord baseRecord : baseRecords) { - HoodieRecord record = new HoodieRecord(baseRecord.getKey(), generateAvroPayload(baseRecord.getKey(), instantTime)); + HoodieRecord record = new HoodieAvroRecord(baseRecord.getKey(), generateAvroPayload(baseRecord.getKey(), instantTime)); updates.add(record); } return updates; @@ -596,11 +619,11 @@ public HoodieRecord generateDeleteRecord(HoodieRecord existingRecord) throws IOE public HoodieRecord generateDeleteRecord(HoodieKey key) throws IOException { RawTripTestPayload payload = new RawTripTestPayload(Option.empty(), key.getRecordKey(), key.getPartitionPath(), null, true, 0L); - return new HoodieRecord(key, payload); + return new 
HoodieAvroRecord(key, payload); } public HoodieRecord generateUpdateRecord(HoodieKey key, String instantTime) throws IOException { - return new HoodieRecord(key, generateRandomValue(key, instantTime)); + return new HoodieAvroRecord(key, generateRandomValue(key, instantTime)); } public List generateUpdates(String instantTime, List baseRecords) throws IOException { @@ -615,7 +638,7 @@ public List generateUpdates(String instantTime, List public List generateUpdatesWithTS(String instantTime, List baseRecords, int ts) throws IOException { List updates = new ArrayList<>(); for (HoodieRecord baseRecord : baseRecords) { - HoodieRecord record = new HoodieRecord(baseRecord.getKey(), + HoodieRecord record = new HoodieAvroRecord(baseRecord.getKey(), generateRandomValue(baseRecord.getKey(), instantTime, false, ts)); updates.add(record); } @@ -653,7 +676,7 @@ public List generateUpdates(String instantTime, Integer n) throws for (int i = 0; i < n; i++) { Map existingKeys = existingKeysBySchema.get(TRIP_EXAMPLE_SCHEMA); Integer numExistingKeys = numKeysBySchema.get(TRIP_EXAMPLE_SCHEMA); - KeyPartition kp = existingKeys.get(RAND.nextInt(numExistingKeys - 1)); + KeyPartition kp = existingKeys.get(rand.nextInt(numExistingKeys - 1)); HoodieRecord record = generateUpdateRecord(kp.key, instantTime); updates.add(record); } @@ -725,7 +748,7 @@ public Stream generateUniqueUpdatesStream(String instantTime, Inte } return IntStream.range(0, n).boxed().map(i -> { - int index = numExistingKeys == 1 ? 0 : RAND.nextInt(numExistingKeys - 1); + int index = numExistingKeys == 1 ? 0 : rand.nextInt(numExistingKeys - 1); KeyPartition kp = existingKeys.get(index); // Find the available keyPartition starting from randomly chosen one. while (used.contains(kp)) { @@ -735,7 +758,7 @@ public Stream generateUniqueUpdatesStream(String instantTime, Inte logger.debug("key getting updated: " + kp.key.getRecordKey()); used.add(kp); try { - return new HoodieRecord(kp.key, generateRandomValueAsPerSchema(schemaStr, kp.key, instantTime, false)); + return new HoodieAvroRecord(kp.key, generateRandomValueAsPerSchema(schemaStr, kp.key, instantTime, false)); } catch (IOException e) { throw new HoodieIOException(e.getMessage(), e); } @@ -758,7 +781,7 @@ public Stream generateUniqueDeleteStream(Integer n) { List result = new ArrayList<>(); for (int i = 0; i < n; i++) { - int index = RAND.nextInt(numExistingKeys); + int index = rand.nextInt(numExistingKeys); while (!existingKeys.containsKey(index)) { index = (index + 1) % numExistingKeys; } @@ -790,7 +813,7 @@ public Stream generateUniqueDeleteRecordStream(String instantTime, List result = new ArrayList<>(); for (int i = 0; i < n; i++) { - int index = RAND.nextInt(numExistingKeys); + int index = rand.nextInt(numExistingKeys); while (!existingKeys.containsKey(index)) { index = (index + 1) % numExistingKeys; } @@ -801,7 +824,7 @@ public Stream generateUniqueDeleteRecordStream(String instantTime, numExistingKeys--; used.add(kp); try { - result.add(new HoodieRecord(kp.key, generateRandomDeleteValue(kp.key, instantTime))); + result.add(new HoodieAvroRecord(kp.key, generateRandomDeleteValue(kp.key, instantTime))); } catch (IOException e) { throw new HoodieIOException(e.getMessage(), e); } @@ -840,8 +863,8 @@ public boolean deleteExistingKeyIfPresent(HoodieKey key) { public List generateGenericRecords(int numRecords) { List list = new ArrayList<>(); IntStream.range(0, numRecords).forEach(i -> { - list.add(generateGenericRecord(UUID.randomUUID().toString(), "0", UUID.randomUUID().toString(), 
UUID.randomUUID() - .toString(), RAND.nextLong())); + list.add(generateGenericRecord(genPseudoRandomUUID(rand).toString(), "0", + genPseudoRandomUUID(rand).toString(), genPseudoRandomUUID(rand).toString(), rand.nextLong())); }); return list; } @@ -864,4 +887,31 @@ public static class KeyPartition implements Serializable { public void close() { existingKeysBySchema.clear(); } + + private static long genRandomTimeMillis(Random r) { + // Fri Feb 13 15:31:30 PST 2009 + long anchorTs = 1234567890L; + // NOTE: To provide for certainty and not generate overly random dates, we will limit + // dispersion to be w/in +/- 3 days from the anchor date + return anchorTs + r.nextLong() % 259200000L; + } + + private static UUID genPseudoRandomUUID(Random r) { + byte[] bytes = new byte[16]; + r.nextBytes(bytes); + + bytes[6] &= 0x0f; + bytes[6] |= 0x40; + bytes[8] &= 0x3f; + bytes[8] |= 0x80; + + try { + Constructor ctor = UUID.class.getDeclaredConstructor(byte[].class); + ctor.setAccessible(true); + return ctor.newInstance((Object) bytes); + } catch (InvocationTargetException | InstantiationException | IllegalAccessException | NoSuchMethodException e) { + logger.info("Failed to generate pseudo-random UUID!"); + throw new HoodieException(e); + } + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestTable.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestTable.java index 7b8148a612a8b..f78312217eec2 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestTable.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestTable.java @@ -354,7 +354,6 @@ public HoodieRollbackMetadata getRollbackMetadata(String instantTimeToDelete, Ma rollbackPartitionMetadata.setPartitionPath(entry.getKey()); rollbackPartitionMetadata.setSuccessDeleteFiles(entry.getValue()); rollbackPartitionMetadata.setFailedDeleteFiles(new ArrayList<>()); - rollbackPartitionMetadata.setWrittenLogFiles(getWrittenLogFiles(instantTimeToDelete, entry)); long rollbackLogFileSize = 50 + RANDOM.nextInt(500); String fileId = UUID.randomUUID().toString(); String logFileName = logFileName(instantTimeToDelete, fileId, 0); @@ -1045,7 +1044,7 @@ private static HoodieTestTableState getTestTableStateWithPartitionFileInfo(Write return testTableState; } - private static List generateHoodieWriteStatForPartition(Map>> partitionToFileIdMap, + public static List generateHoodieWriteStatForPartition(Map>> partitionToFileIdMap, String commitTime, boolean bootstrap) { List writeStats = new ArrayList<>(); for (Map.Entry>> entry : partitionToFileIdMap.entrySet()) { diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java index c623c2f5df590..f9c9898f20192 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java @@ -18,6 +18,7 @@ package org.apache.hudi.common.testutils; +import org.apache.hudi.common.fs.HoodieWrapperFileSystem; import org.apache.hudi.common.model.HoodieAvroPayload; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieTableType; @@ -25,6 +26,7 @@ import org.apache.hudi.common.model.HoodieWriteStat.RuntimeStats; import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; +import 
org.apache.hudi.metadata.HoodieTableMetadata; import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.io.Input; @@ -176,4 +178,17 @@ public static List generateFakeHoodieWriteStat(int limit) { } return writeStatList; } + + public static void createCompactionCommitInMetadataTable( + Configuration hadoopConf, HoodieWrapperFileSystem wrapperFs, String basePath, + String instantTime) throws IOException { + // This is to simulate a completed compaction commit in metadata table timeline, + // so that the commits on data table timeline can be archived + // Note that, if metadata table is enabled, instants in data table timeline, + // which are more recent than the last compaction on the metadata table, + // are not archived (HoodieTimelineArchiveLog::getInstantsToArchive) + String metadataTableBasePath = HoodieTableMetadata.getMetadataTableBasePath(basePath); + HoodieTestUtils.init(hadoopConf, metadataTableBasePath, HoodieTableType.MERGE_ON_READ); + HoodieTestDataGenerator.createCommitFile(metadataTableBasePath, instantTime + "001", wrapperFs.getConf()); + } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/RawTripTestPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/RawTripTestPayload.java index 8bd10823dacf2..c052b63ab544b 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/RawTripTestPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/RawTripTestPayload.java @@ -80,6 +80,23 @@ public RawTripTestPayload(String jsonData) throws IOException { this.isDeleted = false; } + /** + * @deprecated PLEASE READ THIS CAREFULLY + * + * Converting properly typed schemas into JSON leads to inevitable information loss, since JSON + * encodes only representation of the record (with no schema accompanying it), therefore occasionally + * losing nuances of the original data-types provided by the schema (for ex, with 1.23 literal it's + * impossible to tell whether original type was Double or Decimal). + * + * Multiplied by the fact that Spark 2 JSON schema inference has substantial gaps in it (see below), + * it's **NOT RECOMMENDED** to use this method. Instead please consider using {@link AvroConversionUtils#createDataframe()} + * method accepting list of {@link HoodieRecord} (as produced by the {@link HoodieTestDataGenerator} + * to create Spark's {@code Dataframe}s directly. 
+ * + * REFs + * https://medium.com/swlh/notes-about-json-schema-handling-in-spark-sql-be1e7f13839d + */ + @Deprecated public static List recordsToStrings(List records) { return records.stream().map(RawTripTestPayload::recordToString).filter(Option::isPresent).map(Option::get) .collect(Collectors.toList()); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/SchemaTestUtil.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/SchemaTestUtil.java index bd1e3b764e1bc..ab77caa1bcb83 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/SchemaTestUtil.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/SchemaTestUtil.java @@ -21,6 +21,7 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.MercifulJsonConverter; import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; @@ -71,6 +72,10 @@ public static List generateTestRecords(int from, int limit) throw return toRecords(getSimpleSchema(), getSimpleSchema(), from, limit); } + public static List generateTestGenericRecords(int from, int limit) throws IOException, URISyntaxException { + return toRecords(getSimpleSchema(), getSimpleSchema(), from, limit); + } + public static List generateTestJsonRecords(int from, int limit) throws IOException, URISyntaxException { Path dataPath = initializeSampleDataPath(); @@ -81,9 +86,9 @@ public static List generateTestJsonRecords(int from, int limit) throws I } } - private static List toRecords(Schema writerSchema, Schema readerSchema, int from, int limit) + private static List toRecords(Schema writerSchema, Schema readerSchema, int from, int limit) throws IOException, URISyntaxException { - GenericDatumReader reader = new GenericDatumReader<>(writerSchema, readerSchema); + GenericDatumReader reader = new GenericDatumReader<>(writerSchema, readerSchema); Path dataPath = initializeSampleDataPath(); try (Stream stream = Files.lines(dataPath)) { @@ -148,7 +153,7 @@ public static List generateHoodieTestRecords(int from, int limit, } private static HoodieRecord convertToHoodieRecords(IndexedRecord iRecord, String key, String partitionPath) { - return new HoodieRecord<>(new HoodieKey(key, partitionPath), + return new HoodieAvroRecord<>(new HoodieKey(key, partitionPath), new HoodieAvroPayload(Option.of((GenericRecord) iRecord))); } @@ -168,7 +173,7 @@ public static List generateHoodieTestRecordsWithoutHoodieMetadata( throws IOException, URISyntaxException { List iRecords = generateTestRecords(from, limit); - return iRecords.stream().map(r -> new HoodieRecord<>(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), + return iRecords.stream().map(r -> new HoodieAvroRecord<>(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), new HoodieAvroPayload(Option.of((GenericRecord) r)))).collect(Collectors.toList()); } @@ -176,9 +181,9 @@ public static List updateHoodieTestRecordsWithoutHoodieMetadata(Li Schema schema, String fieldNameToUpdate, String newValue) { return oldRecords.stream().map(r -> { try { - GenericRecord rec = (GenericRecord) r.getData().getInsertValue(schema).get(); + GenericRecord rec = (GenericRecord) ((HoodieAvroRecord) r).getData().getInsertValue(schema).get(); rec.put(fieldNameToUpdate, newValue); - return new HoodieRecord<>(r.getKey(), new HoodieAvroPayload(Option.of(rec))); + return new 
HoodieAvroRecord<>(r.getKey(), new HoodieAvroPayload(Option.of(rec))); } catch (IOException io) { throw new HoodieIOException("unable to get data from hoodie record", io); } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/SpillableMapTestUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/SpillableMapTestUtils.java index 89155904ec605..2e450660b5a4c 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/SpillableMapTestUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/SpillableMapTestUtils.java @@ -19,6 +19,7 @@ package org.apache.hudi.common.testutils; import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; @@ -48,7 +49,7 @@ public static List upsertRecords(List iRecords, String partitionPath = ((GenericRecord) r).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); recordKeys.add(key); HoodieRecord record = - new HoodieRecord<>(new HoodieKey(key, partitionPath), new HoodieAvroPayload(Option.of((GenericRecord) r))); + new HoodieAvroRecord<>(new HoodieKey(key, partitionPath), new HoodieAvroPayload(Option.of((GenericRecord) r))); record.unseal(); record.setCurrentLocation(new HoodieRecordLocation("DUMMY_COMMIT_TIME", "DUMMY_FILE_ID")); record.seal(); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestParquetUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestParquetUtils.java index e07c0fad3d24e..7bef8477125c2 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/TestParquetUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestParquetUtils.java @@ -147,7 +147,7 @@ public void testFetchRecordKeyPartitionPathFromParquet(String typeCode) throws E // Read and verify List fetchedRows = - parquetUtils.fetchRecordKeyPartitionPath(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath)); + parquetUtils.fetchHoodieKeys(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath)); assertEquals(rowKeys.size(), fetchedRows.size(), "Total count does not match"); for (HoodieKey entry : fetchedRows) { @@ -173,7 +173,7 @@ public void testFetchRecordKeyPartitionPathVirtualKeysFromParquet() throws Excep // Read and verify List fetchedRows = - parquetUtils.fetchRecordKeyPartitionPath(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath), + parquetUtils.fetchHoodieKeys(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath), Option.of(new TestBaseKeyGen("abc","def"))); assertEquals(rowKeys.size(), fetchedRows.size(), "Total count does not match"); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestBitCaskDiskMap.java b/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestBitCaskDiskMap.java index 208c6d96995b8..9bbe4277162e0 100755 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestBitCaskDiskMap.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestBitCaskDiskMap.java @@ -20,6 +20,7 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; @@ -186,7 +187,7 @@ public void 
testSizeEstimator() throws IOException, URISyntaxException { schema = SchemaTestUtil.getSimpleSchema(); List indexedRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1); hoodieRecords = - indexedRecords.stream().map(r -> new HoodieRecord<>(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), + indexedRecords.stream().map(r -> new HoodieAvroRecord<>(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), new AvroBinaryTestPayload(Option.of((GenericRecord) r)))).collect(Collectors.toList()); payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), new HoodieRecordSizeEstimator(schema)); assertTrue(payloadSize > 0); @@ -195,7 +196,7 @@ public void testSizeEstimator() throws IOException, URISyntaxException { final Schema simpleSchemaWithMetadata = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema()); indexedRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1); hoodieRecords = indexedRecords.stream() - .map(r -> new HoodieRecord<>(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), + .map(r -> new HoodieAvroRecord<>(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), new AvroBinaryTestPayload( Option.of(HoodieAvroUtils.rewriteRecord((GenericRecord) r, simpleSchemaWithMetadata))))) .collect(Collectors.toList()); @@ -212,7 +213,7 @@ public void testPutAll(boolean isCompressionEnabled) throws IOException, URISynt iRecords.forEach(r -> { String key = ((GenericRecord) r).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); String partitionPath = ((GenericRecord) r).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); - HoodieRecord value = new HoodieRecord<>(new HoodieKey(key, partitionPath), new HoodieAvroPayload(Option.of((GenericRecord) r))); + HoodieRecord value = new HoodieAvroRecord<>(new HoodieKey(key, partitionPath), new HoodieAvroPayload(Option.of((GenericRecord) r))); recordMap.put(key, value); }); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestExternalSpillableMap.java b/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestExternalSpillableMap.java index f7b45e9d839b6..e33baf1493a93 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestExternalSpillableMap.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestExternalSpillableMap.java @@ -20,6 +20,7 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; @@ -135,7 +136,7 @@ public void testSimpleUpsert(ExternalSpillableMap.DiskMapType diskMapType, boole updatedRecords.forEach(record -> { HoodieRecord rec = records.get(((GenericRecord) record).get(HoodieRecord.RECORD_KEY_METADATA_FIELD)); try { - assertEquals(rec.getData().getInsertValue(schema).get(), record); + assertEquals(((HoodieAvroRecord) rec).getData().getInsertValue(schema).get(), record); } catch (IOException io) { throw new UncheckedIOException(io); } @@ -159,13 +160,13 @@ public void testAllMapOperations(ExternalSpillableMap.DiskMapType diskMapType, b IndexedRecord inMemoryRecord = iRecords.get(0); String ikey = ((GenericRecord) inMemoryRecord).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); String iPartitionPath = ((GenericRecord) inMemoryRecord).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); - HoodieRecord inMemoryHoodieRecord 
= new HoodieRecord<>(new HoodieKey(ikey, iPartitionPath), + HoodieRecord inMemoryHoodieRecord = new HoodieAvroRecord<>(new HoodieKey(ikey, iPartitionPath), new HoodieAvroPayload(Option.of((GenericRecord) inMemoryRecord))); IndexedRecord onDiskRecord = iRecords.get(99); String dkey = ((GenericRecord) onDiskRecord).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); String dPartitionPath = ((GenericRecord) onDiskRecord).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); - HoodieRecord onDiskHoodieRecord = new HoodieRecord<>(new HoodieKey(dkey, dPartitionPath), + HoodieRecord onDiskHoodieRecord = new HoodieAvroRecord<>(new HoodieKey(dkey, dPartitionPath), new HoodieAvroPayload(Option.of((GenericRecord) onDiskRecord))); // assert size assert records.size() == 100; @@ -241,7 +242,7 @@ public void testDataCorrectnessWithUpsertsToDataInMapAndOnDisk(ExternalSpillable // Get a record from the in-Memory map String key = recordKeys.get(0); - HoodieRecord record = records.get(key); + HoodieAvroRecord record = (HoodieAvroRecord) records.get(key); List recordsToUpdate = new ArrayList<>(); recordsToUpdate.add((IndexedRecord) record.getData().getInsertValue(schema).get()); @@ -259,7 +260,7 @@ public void testDataCorrectnessWithUpsertsToDataInMapAndOnDisk(ExternalSpillable // Get a record from the disk based map key = recordKeys.get(recordKeys.size() - 1); - record = records.get(key); + record = (HoodieAvroRecord) records.get(key); recordsToUpdate = new ArrayList<>(); recordsToUpdate.add((IndexedRecord) record.getData().getInsertValue(schema).get()); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestRocksDbDiskMap.java b/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestRocksDbDiskMap.java index 2ae521fc8c217..31daaab213604 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestRocksDbDiskMap.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/collection/TestRocksDbDiskMap.java @@ -18,12 +18,9 @@ package org.apache.hudi.common.util.collection; -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.generic.IndexedRecord; - import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; @@ -33,6 +30,9 @@ import org.apache.hudi.common.testutils.SpillableMapTestUtils; import org.apache.hudi.common.util.Option; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -166,7 +166,7 @@ public void testPutAll() throws IOException, URISyntaxException { iRecords.forEach(r -> { String key = ((GenericRecord) r).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); String partitionPath = ((GenericRecord) r).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); - HoodieRecord value = new HoodieRecord<>(new HoodieKey(key, partitionPath), new HoodieAvroPayload(Option.of((GenericRecord) r))); + HoodieRecord value = new HoodieAvroRecord<>(new HoodieKey(key, partitionPath), new HoodieAvroPayload(Option.of((GenericRecord) r))); recordMap.put(key, value); }); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/io/TestByteBufferBackedInputStream.java 
b/hudi-common/src/test/java/org/apache/hudi/common/util/io/TestByteBufferBackedInputStream.java
new file mode 100644
index 0000000000000..87bd2eea2ebe5
--- /dev/null
+++ b/hudi-common/src/test/java/org/apache/hudi/common/util/io/TestByteBufferBackedInputStream.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.common.util.io;
+
+import org.junit.jupiter.api.Test;
+
+import java.nio.ByteBuffer;
+
+import static org.junit.jupiter.api.Assertions.assertArrayEquals;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+public class TestByteBufferBackedInputStream {
+
+  @Test
+  public void testConstructor() {
+    byte[] bytes = { 0xD, 0xE, 0xA, 0xD, 0xD, 0xE, 0xE, 0xD };
+    ByteBuffer byteBuf = ByteBuffer.wrap(bytes, 0, 1);
+    ByteBuffer byteBufClone = byteBuf.duplicate();
+
+    // ByteBuffer ctor
+    ByteBufferBackedInputStream first = new ByteBufferBackedInputStream(byteBuf);
+
+    assertEquals(0xD, first.read());
+    assertThrows(IllegalArgumentException.class, first::read);
+    // Make sure that the original buffer stays intact
+    assertEquals(byteBufClone, byteBuf);
+
+    // byte[] ctor
+    ByteBufferBackedInputStream second = new ByteBufferBackedInputStream(bytes);
+
+    assertEquals(0xD, second.read());
+
+    // byte[] ctor (w/ offset)
+    ByteBufferBackedInputStream third = new ByteBufferBackedInputStream(bytes, 1, 1);
+
+    assertEquals(0xE, third.read());
+    assertThrows(IllegalArgumentException.class, third::read);
+  }
+
+  @Test
+  public void testRead() {
+    byte[] sourceBytes = { 0xD, 0xE, 0xA, 0xD, 0xD, 0xE, 0xE, 0xD };
+
+    ByteBufferBackedInputStream stream = new ByteBufferBackedInputStream(sourceBytes);
+
+    int firstByte = stream.read();
+    assertEquals(0xD, firstByte);
+
+    byte[] readBytes = new byte[4];
+    int read = stream.read(readBytes, 1, 3);
+
+    assertEquals(3, read);
+    assertArrayEquals(new byte[]{0, 0xE, 0xA, 0xD}, readBytes);
+    assertEquals(4, stream.getPosition());
+  }
+
+  @Test
+  public void testSeek() {
+    byte[] sourceBytes = { 0xD, 0xE, 0xA, 0xD, 0xD, 0xA, 0xE, 0xD };
+
+    ByteBufferBackedInputStream stream = new ByteBufferBackedInputStream(sourceBytes, 1, 7);
+
+    // Seek to position 1 in the stream (offset 2 in the original buffer)
+    stream.seek(1);
+    int firstRead = stream.read();
+    assertEquals(0xA, firstRead);
+
+    // Seek to position 5 in the stream (offset 6 in the original buffer)
+    stream.seek(5);
+    int secondRead = stream.read();
+    assertEquals(0xE, secondRead);
+
+    // Try to seek past the stream boundary
+    assertThrows(IllegalArgumentException.class, () -> stream.seek(8));
+  }
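Editor's sketch of the class under test, inferred purely from the behavior these tests assert (the real org.apache.hudi.common.util.io.ByteBufferBackedInputStream may well differ in details such as copyFrom and the ByteBuffer constructor):

```java
import java.io.InputStream;
import java.nio.ByteBuffer;

// Hypothetical skeleton: an InputStream view over a byte[] slice.
public class ByteBufferBackedInputStreamSketch extends InputStream {
  private final ByteBuffer buffer;

  public ByteBufferBackedInputStreamSketch(byte[] bytes, int offset, int length) {
    // slice() so position 0 of the stream maps to `offset` in the source array
    this.buffer = ByteBuffer.wrap(bytes, offset, length).slice();
  }

  @Override
  public int read() {
    if (!buffer.hasRemaining()) {
      // the tests expect IllegalArgumentException on over-read, not -1
      throw new IllegalArgumentException("Buffer exhausted");
    }
    return buffer.get() & 0xFF;
  }

  public void seek(long pos) {
    buffer.position(Math.toIntExact(pos)); // throws IllegalArgumentException past the limit
  }

  public long getPosition() {
    return buffer.position();
  }
}
```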
ByteBufferBackedInputStream(sourceBytes); + + int firstByte = stream.read(); + assertEquals(firstByte, 0xD); + + // Copy 5 bytes from the stream (while keeping the stream's state intact) + byte[] targetBytes = new byte[5]; + stream.copyFrom(2, targetBytes, 0, targetBytes.length); + + assertArrayEquals(new byte[] { 0xA, 0xD, 0xD, 0xA, 0xE }, targetBytes); + + // Continue reading the stream from where we left off (before copying) + int secondByte = stream.read(); + assertEquals(secondByte, 0xE); + } +} diff --git a/hudi-common/src/test/resources/timestamp-test-evolved.avsc b/hudi-common/src/test/resources/timestamp-test-evolved.avsc index beb36329eabac..7a52ca6f245e1 100644 --- a/hudi-common/src/test/resources/timestamp-test-evolved.avsc +++ b/hudi-common/src/test/resources/timestamp-test-evolved.avsc @@ -20,7 +20,43 @@ "type": "record", "name": "User", "fields": [ - {"name": "field1", "type": ["null", "string"], "default": null}, - {"name": "createTime", "type": ["null", "long"], "default": null} + { + "name": "field1", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "createTime", + "type": [ + "null", + "long" + ], + "default": null + }, + { + "name": "createTimeString", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "createTimeDecimal", + "type": [ + "null", + { + "name": "decimalFixed", + "type": "fixed", + "logicalType": "decimal", + "precision": 20, + "scale": 4, + "size": 10 + } + ] + } ] } \ No newline at end of file diff --git a/hudi-examples/src/main/java/org/apache/hudi/examples/common/HoodieExampleDataGenerator.java b/hudi-examples/src/main/java/org/apache/hudi/examples/common/HoodieExampleDataGenerator.java index 71c6408ccb2cd..78df2e78e7081 100644 --- a/hudi-examples/src/main/java/org/apache/hudi/examples/common/HoodieExampleDataGenerator.java +++ b/hudi-examples/src/main/java/org/apache/hudi/examples/common/HoodieExampleDataGenerator.java @@ -20,6 +20,7 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; @@ -126,7 +127,7 @@ public Stream> generateInsertsStream(String commitTime, Integer kp.partitionPath = partitionPath; existingKeys.put(currSize + i, kp); numExistingKeys++; - return new HoodieRecord<>(key, generateRandomValue(key, commitTime)); + return new HoodieAvroRecord<>(key, generateRandomValue(key, commitTime)); }); } @@ -149,7 +150,7 @@ public List> generateUpdates(String commitTime, Integer n) { } public HoodieRecord generateUpdateRecord(HoodieKey key, String commitTime) { - return new HoodieRecord<>(key, generateRandomValue(key, commitTime)); + return new HoodieAvroRecord<>(key, generateRandomValue(key, commitTime)); } private Option convertToString(HoodieRecord record) { diff --git a/hudi-examples/src/main/java/org/apache/hudi/examples/java/HoodieJavaWriteClientExample.java b/hudi-examples/src/main/java/org/apache/hudi/examples/java/HoodieJavaWriteClientExample.java index 587f73b0f7fd4..4890a6529a52c 100644 --- a/hudi-examples/src/main/java/org/apache/hudi/examples/java/HoodieJavaWriteClientExample.java +++ b/hudi-examples/src/main/java/org/apache/hudi/examples/java/HoodieJavaWriteClientExample.java @@ -18,11 +18,11 @@ package org.apache.hudi.examples.java; -import org.apache.hadoop.conf.Configuration; import org.apache.hudi.client.HoodieJavaWriteClient; import 
org.apache.hudi.client.common.HoodieJavaEngineContext; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; @@ -33,6 +33,7 @@ import org.apache.hudi.examples.common.HoodieExampleDataGenerator; import org.apache.hudi.index.HoodieIndex; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; @@ -95,7 +96,7 @@ public static void main(String[] args) throws Exception { List> records = dataGen.generateInserts(newCommitTime, 10); List> recordsSoFar = new ArrayList<>(records); List> writeRecords = - recordsSoFar.stream().map(r -> new HoodieRecord(r)).collect(Collectors.toList()); + recordsSoFar.stream().map(r -> new HoodieAvroRecord(r)).collect(Collectors.toList()); client.insert(writeRecords, newCommitTime); // updates @@ -105,7 +106,7 @@ public static void main(String[] args) throws Exception { records.addAll(toBeUpdated); recordsSoFar.addAll(toBeUpdated); writeRecords = - recordsSoFar.stream().map(r -> new HoodieRecord(r)).collect(Collectors.toList()); + recordsSoFar.stream().map(r -> new HoodieAvroRecord(r)).collect(Collectors.toList()); client.upsert(writeRecords, newCommitTime); // Delete diff --git a/hudi-examples/src/main/java/org/apache/hudi/examples/spark/HoodieWriteClientExample.java b/hudi-examples/src/main/java/org/apache/hudi/examples/spark/HoodieWriteClientExample.java index 35e46605f17b2..1afc180531a16 100644 --- a/hudi-examples/src/main/java/org/apache/hudi/examples/spark/HoodieWriteClientExample.java +++ b/hudi-examples/src/main/java/org/apache/hudi/examples/spark/HoodieWriteClientExample.java @@ -38,6 +38,8 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hudi.table.action.HoodieWriteMetadata; + import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.SparkConf; @@ -140,8 +142,8 @@ public static void main(String[] args) throws Exception { // compaction if (HoodieTableType.valueOf(tableType) == HoodieTableType.MERGE_ON_READ) { Option instant = client.scheduleCompaction(Option.empty()); - JavaRDD writeStatues = client.compact(instant.get()); - client.commitCompaction(instant.get(), writeStatues, Option.empty()); + HoodieWriteMetadata> compactionMetadata = client.compact(instant.get()); + client.commitCompaction(instant.get(), compactionMetadata.getCommitMetadata().get(), Option.empty()); } } diff --git a/hudi-examples/src/main/scala/org/apache/hudi/examples/spark/HoodieDataSourceExample.scala b/hudi-examples/src/main/scala/org/apache/hudi/examples/spark/HoodieDataSourceExample.scala index cb221fcef3b2a..77b3885e3cf7a 100644 --- a/hudi-examples/src/main/scala/org/apache/hudi/examples/spark/HoodieDataSourceExample.scala +++ b/hudi-examples/src/main/scala/org/apache/hudi/examples/spark/HoodieDataSourceExample.scala @@ -82,7 +82,7 @@ object HoodieDataSourceExample { option(PARTITIONPATH_FIELD.key, "partitionpath"). option(TBL_NAME.key, tableName). mode(Overwrite). - save(tablePath) + save(tablePath) } /** @@ -127,7 +127,7 @@ object HoodieDataSourceExample { option(PARTITIONPATH_FIELD.key, "partitionpath"). option(TBL_NAME.key, tableName). mode(Append). 
- save(tablePath) + save(tablePath) } /** diff --git a/hudi-flink/pom.xml b/hudi-flink/pom.xml index c8fac38be5b18..27a4a0b453cb7 100644 --- a/hudi-flink/pom.xml +++ b/hudi-flink/pom.xml @@ -164,13 +164,13 @@ org.apache.flink - flink-table-runtime-blink_${scala.binary.version} + flink-table-runtime_${scala.binary.version} ${flink.version} provided org.apache.flink - flink-table-planner-blink_${scala.binary.version} + flink-table-planner_${scala.binary.version} ${flink.version} provided @@ -307,7 +307,7 @@ org.apache.flink - flink-runtime_${scala.binary.version} + flink-runtime ${flink.version} test test-jar @@ -321,7 +321,7 @@ org.apache.flink - flink-table-runtime-blink_${scala.binary.version} + flink-table-runtime_${scala.binary.version} ${flink.version} test test-jar diff --git a/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java b/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java index 77c3f15e54c45..1be90603605cd 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java +++ b/hudi-flink/src/main/java/org/apache/hudi/configuration/FlinkOptions.java @@ -23,9 +23,11 @@ import org.apache.hudi.common.config.HoodieConfig; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; +import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor; +import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.apache.hudi.keygen.constant.KeyGeneratorType; @@ -106,6 +108,12 @@ private FlinkOptions() { // ------------------------------------------------------------------------ // Index Options // ------------------------------------------------------------------------ + public static final ConfigOption INDEX_TYPE = ConfigOptions + .key("index.type") + .stringType() + .defaultValue(HoodieIndex.IndexType.FLINK_STATE.name()) + .withDescription("Index type of Flink write job, default is using state backed index."); + public static final ConfigOption INDEX_BOOTSTRAP_ENABLED = ConfigOptions .key("index.bootstrap.enabled") .booleanType() @@ -310,6 +318,20 @@ private FlinkOptions() { + "Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using " + "the dot notation eg: `a.b.c`"); + public static final ConfigOption INDEX_KEY_FIELD = ConfigOptions + .key(HoodieIndexConfig.BUCKET_INDEX_HASH_FIELD.key()) + .stringType() + .defaultValue("") + .withDescription("Index key field. Value to be used as hashing to find the bucket ID. Should be a subset of or equal to the recordKey fields.\n" + + "Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using " + + "the dot notation eg: `a.b.c`"); + + public static final ConfigOption BUCKET_INDEX_NUM_BUCKETS = ConfigOptions + .key(HoodieIndexConfig.BUCKET_INDEX_NUM_BUCKETS.key()) + .intType() + .defaultValue(4) // default 4 buckets per partition + .withDescription("Hudi bucket number per partition. 
Only takes effect when using the Hudi bucket index."); + public static final ConfigOption PARTITION_PATH_FIELD = ConfigOptions .key(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()) .stringType() diff --git a/hudi-flink/src/main/java/org/apache/hudi/configuration/OptionsResolver.java b/hudi-flink/src/main/java/org/apache/hudi/configuration/OptionsResolver.java index acb4af61110fa..6ebf09069be60 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/configuration/OptionsResolver.java +++ b/hudi-flink/src/main/java/org/apache/hudi/configuration/OptionsResolver.java @@ -20,6 +20,7 @@ import org.apache.hudi.common.model.DefaultHoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.format.FilePathUtils; import org.apache.flink.configuration.Configuration; @@ -101,6 +102,10 @@ public static boolean isPartitionedTable(Configuration conf) { return FilePathUtils.extractPartitionKeys(conf).length > 0; } + public static boolean isBucketIndexType(Configuration conf) { + return conf.getString(FlinkOptions.INDEX_TYPE).equals(HoodieIndex.IndexType.BUCKET.name()); + } + /** * Returns whether the source should emit changelog. * diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/BucketStreamWriteFunction.java b/hudi-flink/src/main/java/org/apache/hudi/sink/BucketStreamWriteFunction.java new file mode 100644 index 0000000000000..128358096cde6 --- /dev/null +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/BucketStreamWriteFunction.java @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.state.FunctionInitializationContext; +import org.apache.flink.streaming.api.functions.ProcessFunction; +import org.apache.flink.util.Collector; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordLocation; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.configuration.FlinkOptions; +import org.apache.hudi.index.bucket.BucketIdentifier; +import org.apache.hudi.table.HoodieFlinkTable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; + +import static java.util.stream.Collectors.toList; + +/** + * A stream write function with bucket hash index. + * + *
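A minimal sketch, not part of the patch, of enabling this index type from a Flink job. It uses only the options and helper added above; everything else is the standard Flink Configuration API, and the bucket count of 8 is an arbitrary illustration:

    import org.apache.flink.configuration.Configuration;
    import org.apache.hudi.configuration.FlinkOptions;
    import org.apache.hudi.index.HoodieIndex;

    public class BucketIndexConfigSketch {
      public static Configuration bucketIndexConf() {
        Configuration conf = new Configuration();
        conf.setString(FlinkOptions.INDEX_TYPE, HoodieIndex.IndexType.BUCKET.name());
        conf.setInteger(FlinkOptions.BUCKET_INDEX_NUM_BUCKETS, 8); // defaults to 4 per partition
        // if left empty, HoodieTableFactory (changed further below) falls back
        // to the record key fields
        conf.setString(FlinkOptions.INDEX_KEY_FIELD, "uuid");
        return conf; // OptionsResolver.isBucketIndexType(conf) now returns true
      }
    }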

The task holds a fresh local index: {(partition + bucket number) &rarr; fileId} mapping; this index + * is used to decide whether an incoming record is an UPDATE or an INSERT. + * The index is local because different partition paths have separate items in the index. + * + * @param the input type + */ +public class BucketStreamWriteFunction extends StreamWriteFunction { + + private static final Logger LOG = LoggerFactory.getLogger(BucketStreamWriteFunction.class); + + private int maxParallelism; + + private int parallelism; + + private int bucketNum; + + private transient HoodieFlinkTable table; + + private String indexKeyFields; + + private final HashMap bucketToFileIDMap; + + /** + * Constructs a BucketStreamWriteFunction. + * + * @param config The config options + */ + public BucketStreamWriteFunction(Configuration config) { + super(config); + this.bucketToFileIDMap = new HashMap<>(); + } + + @Override + public void open(Configuration parameters) throws IOException { + super.open(parameters); + this.bucketNum = config.getInteger(FlinkOptions.BUCKET_INDEX_NUM_BUCKETS); + this.indexKeyFields = config.getString(FlinkOptions.INDEX_KEY_FIELD); + this.taskID = getRuntimeContext().getIndexOfThisSubtask(); + this.parallelism = getRuntimeContext().getNumberOfParallelSubtasks(); + this.maxParallelism = getRuntimeContext().getMaxNumberOfParallelSubtasks(); + bootstrapIndex(); + } + + @Override + public void initializeState(FunctionInitializationContext context) throws Exception { + super.initializeState(context); + this.table = this.writeClient.getHoodieTable(); + } + + @Override + public void processElement(I i, ProcessFunction.Context context, Collector collector) throws Exception { + HoodieRecord record = (HoodieRecord) i; + final HoodieKey hoodieKey = record.getKey(); + final HoodieRecordLocation location; + + final int bucketNum = BucketIdentifier.getBucketId(hoodieKey, indexKeyFields, this.bucketNum); + final String partitionBucketId = BucketIdentifier.partitionBucketIdStr(hoodieKey.getPartitionPath(), bucketNum); + + if (bucketToFileIDMap.containsKey(partitionBucketId)) { + location = new HoodieRecordLocation("U", bucketToFileIDMap.get(partitionBucketId)); + } else { + String newFileId = BucketIdentifier.newBucketFileIdPrefix(bucketNum); + location = new HoodieRecordLocation("I", newFileId); + bucketToFileIDMap.put(partitionBucketId, newFileId); + } + record.unseal(); + record.setCurrentLocation(location); + record.seal(); + bufferRecord(record); + } + + /** + * Get the partition_bucket -> fileID mapping from the existing hudi table. + * This is a required operation for each restart to avoid having duplicate file ids for one bucket. + */ + private void bootstrapIndex() throws IOException { + Option latestCommitTime = table.getFileSystemView().getTimeline().filterCompletedInstants().lastInstant(); + if (!latestCommitTime.isPresent()) { + return; + } + // bootstrap bucket info from existing file system + // a bucket belongs to this task when bucketNum % parallelism == taskID + HashSet bucketToLoad = new HashSet<>(); + for (int i = 0; i < bucketNum; i++) { + int partitionOfBucket = BucketIdentifier.mod(i, parallelism); + if (partitionOfBucket == taskID) { + LOG.info(String.format("Bootstrapping index. 
Adding bucket %s , " + + "Current parallelism: %s , Max parallelism: %s , Current task id: %s", + i, parallelism, maxParallelism, taskID)); + bucketToLoad.add(i); + } + } + bucketToLoad.forEach(bucket -> LOG.info(String.format("bucketToLoad contains %s", bucket))); + + LOG.info(String.format("Loading Hoodie Table %s, with path %s", table.getMetaClient().getTableConfig().getTableName(), + table.getMetaClient().getBasePath())); + + // Iterate through all existing partitions to load the existing fileIDs that belong to this task + List partitions = table.getMetadata().getAllPartitionPaths(); + for (String partitionPath : partitions) { + List latestFileSlices = table.getSliceView() + .getLatestFileSlices(partitionPath) + .collect(toList()); + for (FileSlice fileslice : latestFileSlices) { + String fileID = fileslice.getFileId(); + int bucketNumber = BucketIdentifier.bucketIdFromFileId(fileID); + if (bucketToLoad.contains(bucketNumber)) { + String partitionBucketId = BucketIdentifier.partitionBucketIdStr(partitionPath, bucketNumber); + LOG.info(String.format("Should load this partition bucket %s with fileID %s", partitionBucketId, fileID)); + if (bucketToFileIDMap.containsKey(partitionBucketId)) { + throw new RuntimeException(String.format("Duplicate fileID %s from partitionBucket %s found " + + "during the BucketStreamWriteFunction index bootstrap.", fileID, partitionBucketId)); + } else { + LOG.info(String.format("Adding fileID %s to the partition bucket %s.", fileID, partitionBucketId)); + bucketToFileIDMap.put(partitionBucketId, fileID); + } + } + } + } + } +} diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/BucketStreamWriteOperator.java b/hudi-flink/src/main/java/org/apache/hudi/sink/BucketStreamWriteOperator.java new file mode 100644 index 0000000000000..209fe59e4b8ca --- /dev/null +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/BucketStreamWriteOperator.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink; + +import org.apache.flink.configuration.Configuration; +import org.apache.hudi.sink.common.AbstractWriteOperator; +import org.apache.hudi.sink.common.WriteOperatorFactory; + +/** + * Operator for {@link BucketStreamWriteFunction}. 
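Condensed to its essentials, the routing in processElement above is a lookup-or-create on the local map; a minimal sketch, where the BucketIdentifier helper signatures are taken from the calls in this file and the map stands in for bucketToFileIDMap:

    import org.apache.hudi.common.model.HoodieKey;
    import org.apache.hudi.index.bucket.BucketIdentifier;

    import java.util.HashMap;
    import java.util.Map;

    public class BucketRoutingSketch {
      private final Map<String, String> bucketToFileId = new HashMap<>();

      /** Returns "U/fileId" for a known bucket, "I/fileId" for a fresh one. */
      public String route(HoodieKey key, String indexKeyFields, int numBuckets) {
        int bucketId = BucketIdentifier.getBucketId(key, indexKeyFields, numBuckets);
        String partitionBucketId = BucketIdentifier.partitionBucketIdStr(key.getPartitionPath(), bucketId);
        String fileId = bucketToFileId.get(partitionBucketId);
        if (fileId != null) {
          return "U/" + fileId; // the bucket already owns a file group: UPDATE
        }
        fileId = BucketIdentifier.newBucketFileIdPrefix(bucketId);
        bucketToFileId.put(partitionBucketId, fileId);
        return "I/" + fileId; // first record of this bucket: INSERT into a new file group
      }
    }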
+ * + * @param The input type + */ +public class BucketStreamWriteOperator extends AbstractWriteOperator { + + public BucketStreamWriteOperator(Configuration conf) { + super(new BucketStreamWriteFunction<>(conf)); + } + + public static WriteOperatorFactory getFactory(Configuration conf) { + return WriteOperatorFactory.instance(conf, new BucketStreamWriteOperator<>(conf)); + } +} diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteFunction.java b/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteFunction.java index 0e7e35e7ea328..c2f54dd8aaffe 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteFunction.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteFunction.java @@ -19,6 +19,7 @@ package org.apache.hudi.sink; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieOperation; import org.apache.hudi.common.model.HoodieRecord; @@ -217,13 +218,13 @@ public static DataItem fromHoodieRecord(HoodieRecord record) { return new DataItem( record.getRecordKey(), record.getCurrentLocation().getInstantTime(), - record.getData(), + ((HoodieAvroRecord) record).getData(), record.getOperation()); } public HoodieRecord toHoodieRecord(String partitionPath) { HoodieKey hoodieKey = new HoodieKey(this.key, partitionPath); - HoodieRecord record = new HoodieRecord<>(hoodieKey, data, operation); + HoodieRecord record = new HoodieAvroRecord<>(hoodieKey, data, operation); HoodieRecordLocation loc = new HoodieRecordLocation(instant, null); record.setCurrentLocation(loc); return record; @@ -264,7 +265,7 @@ public List writeBuffer() { public void preWrite(List records) { // rewrite the first record with expected fileID HoodieRecord first = records.get(0); - HoodieRecord record = new HoodieRecord<>(first.getKey(), first.getData(), first.getOperation()); + HoodieRecord record = new HoodieAvroRecord<>(first.getKey(), (HoodieRecordPayload) first.getData(), first.getOperation()); HoodieRecordLocation newLoc = new HoodieRecordLocation(first.getCurrentLocation().getInstantTime(), fileID); record.setCurrentLocation(newLoc); @@ -370,7 +371,7 @@ private String getBucketID(HoodieRecord record) { * * @param value HoodieRecord */ - private void bufferRecord(HoodieRecord value) { + protected void bufferRecord(HoodieRecord value) { final String bucketID = getBucketID(value); DataBucket bucket = this.buckets.computeIfAbsent(bucketID, diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteOperatorCoordinator.java b/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteOperatorCoordinator.java index 447cfa420a3f4..4782070e33760 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteOperatorCoordinator.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/StreamWriteOperatorCoordinator.java @@ -415,13 +415,11 @@ private void sendCommitAckEvents(long checkpointId) { CompletableFuture[] futures = Arrays.stream(this.gateways).filter(Objects::nonNull) .map(gw -> gw.sendEvent(CommitAckEvent.getInstance(checkpointId))) .toArray(CompletableFuture[]::new); - try { - CompletableFuture.allOf(futures).get(); - } catch (Throwable throwable) { - if (!sendToFinishedTasks(throwable)) { - throw new HoodieException("Error while waiting for the commit ack events to finish sending", throwable); + CompletableFuture.allOf(futures).whenComplete((resp, error) -> { + if (!sendToFinishedTasks(error)) { + throw new HoodieException("Error 
while waiting for the commit ack events to finish sending", error); } - } + }); } /** diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/BootstrapOperator.java b/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/BootstrapOperator.java index 4832f18bf7f04..0f944c56577e2 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/BootstrapOperator.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/BootstrapOperator.java @@ -20,6 +20,7 @@ import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordGlobalLocation; @@ -29,6 +30,7 @@ import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.BaseFileUtils; +import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.config.HoodieWriteConfig; @@ -209,17 +211,10 @@ protected void loadRecords(String partitionPath) throws Exception { if (!isValidFile(baseFile.getFileStatus())) { return; } - - final List hoodieKeys; - try { - hoodieKeys = - fileUtils.fetchRecordKeyPartitionPath(this.hadoopConf, new Path(baseFile.getPath())); - } catch (Exception e) { - throw new HoodieException(String.format("Error when loading record keys from file: %s", baseFile), e); - } - - for (HoodieKey hoodieKey : hoodieKeys) { - output.collect(new StreamRecord(new IndexRecord(generateHoodieRecord(hoodieKey, fileSlice)))); + try (ClosableIterator iterator = fileUtils.getHoodieKeyIterator(this.hadoopConf, new Path(baseFile.getPath()))) { + iterator.forEachRemaining(hoodieKey -> { + output.collect(new StreamRecord(new IndexRecord(generateHoodieRecord(hoodieKey, fileSlice)))); + }); } }); @@ -251,7 +246,7 @@ protected void loadRecords(String partitionPath) throws Exception { @SuppressWarnings("unchecked") public static HoodieRecord generateHoodieRecord(HoodieKey hoodieKey, FileSlice fileSlice) { - HoodieRecord hoodieRecord = new HoodieRecord(hoodieKey, null); + HoodieRecord hoodieRecord = new HoodieAvroRecord(hoodieKey, null); hoodieRecord.setCurrentLocation(new HoodieRecordGlobalLocation(hoodieKey.getPartitionPath(), fileSlice.getBaseInstantTime(), fileSlice.getFileId())); hoodieRecord.seal(); diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/IndexRecord.java b/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/IndexRecord.java index 2fe83b71ca98f..edae0389b8aca 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/IndexRecord.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/bootstrap/IndexRecord.java @@ -18,16 +18,22 @@ package org.apache.hudi.sink.bootstrap; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; /** * The index record. 
*/ -public class IndexRecord extends HoodieRecord { +public class IndexRecord extends HoodieAvroRecord { private static final long serialVersionUID = 1L; public IndexRecord(HoodieRecord record) { super(record); } + + @Override + public HoodieRecord newInstance() { + return new IndexRecord<>(this); + } } \ No newline at end of file diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactFunction.java b/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactFunction.java index 560b5ffbad305..a43fcd5ad4bf9 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactFunction.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactFunction.java @@ -21,9 +21,11 @@ import org.apache.hudi.client.HoodieFlinkWriteClient; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.model.CompactionOperation; +import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.sink.utils.NonThrownExecutor; import org.apache.hudi.table.HoodieFlinkCopyOnWriteTable; import org.apache.hudi.table.action.compact.HoodieFlinkMergeOnReadTableCompactor; +import org.apache.hudi.util.CompactionUtil; import org.apache.hudi.util.StreamerUtil; import org.apache.flink.annotation.VisibleForTesting; @@ -51,7 +53,7 @@ public class CompactFunction extends ProcessFunction writeClient; /** * Whether to execute compaction asynchronously. @@ -89,21 +91,24 @@ public void processElement(CompactionPlanEvent event, Context context, Collector if (asyncCompaction) { // executes the compaction task asynchronously to not block the checkpoint barrier propagate. executor.execute( - () -> doCompaction(instantTime, compactionOperation, collector), + () -> doCompaction(instantTime, compactionOperation, collector, reloadWriteConfig()), (errMsg, t) -> collector.collect(new CompactionCommitEvent(instantTime, compactionOperation.getFileId(), taskID)), "Execute compaction for instant %s from task %d", instantTime, taskID); } else { // executes the compaction task synchronously for batch mode. 
LOG.info("Execute compaction for instant {} from task {}", instantTime, taskID); - doCompaction(instantTime, compactionOperation, collector); + doCompaction(instantTime, compactionOperation, collector, writeClient.getConfig()); } } - private void doCompaction(String instantTime, CompactionOperation compactionOperation, Collector collector) throws IOException { - HoodieFlinkMergeOnReadTableCompactor compactor = new HoodieFlinkMergeOnReadTableCompactor(); + private void doCompaction(String instantTime, + CompactionOperation compactionOperation, + Collector collector, + HoodieWriteConfig writeConfig) throws IOException { + HoodieFlinkMergeOnReadTableCompactor compactor = new HoodieFlinkMergeOnReadTableCompactor<>(); List writeStatuses = compactor.compact( new HoodieFlinkCopyOnWriteTable<>( - writeClient.getConfig(), + writeConfig, writeClient.getEngineContext(), writeClient.getHoodieTable().getMetaClient()), writeClient.getHoodieTable().getMetaClient(), @@ -114,6 +119,12 @@ private void doCompaction(String instantTime, CompactionOperation compactionOper collector.collect(new CompactionCommitEvent(instantTime, compactionOperation.getFileId(), writeStatuses, taskID)); } + private HoodieWriteConfig reloadWriteConfig() throws Exception { + HoodieWriteConfig writeConfig = writeClient.getConfig(); + CompactionUtil.setAvroSchema(writeConfig, writeClient.getHoodieTable().getMetaClient()); + return writeConfig; + } + @VisibleForTesting public void setExecutor(NonThrownExecutor executor) { this.executor = executor; diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionCommitSink.java b/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionCommitSink.java index 53127359cb986..ecd66936e88c3 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionCommitSink.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/compact/CompactionCommitSink.java @@ -20,12 +20,15 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.data.HoodieList; +import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.util.CompactionUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.sink.CleanFunction; import org.apache.hudi.table.HoodieFlinkTable; +import org.apache.hudi.table.action.compact.CompactHelpers; import org.apache.hudi.util.CompactionUtil; import org.apache.hudi.util.StreamerUtil; @@ -147,8 +150,11 @@ private void doCommit(String instant, Collection events) .flatMap(Collection::stream) .collect(Collectors.toList()); + HoodieCommitMetadata metadata = CompactHelpers.getInstance().createCompactionMetadata( + table, instant, HoodieList.of(statuses), writeClient.getConfig().getSchema()); + // commit the compaction - this.writeClient.commitCompaction(instant, statuses, Option.empty()); + this.writeClient.commitCompaction(instant, metadata, Option.empty()); // Whether to clean up the old log file when compaction if (!conf.getBoolean(FlinkOptions.CLEAN_ASYNC_ENABLED)) { diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/compact/HoodieFlinkCompactor.java b/hudi-flink/src/main/java/org/apache/hudi/sink/compact/HoodieFlinkCompactor.java index a6161f2c88cf8..546136e416b7f 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/compact/HoodieFlinkCompactor.java +++ 
b/hudi-flink/src/main/java/org/apache/hudi/sink/compact/HoodieFlinkCompactor.java @@ -18,7 +18,7 @@ package org.apache.hudi.sink.compact; -import org.apache.hudi.async.HoodieAsyncService; +import org.apache.hudi.async.HoodieAsyncTableService; import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.client.HoodieFlinkWriteClient; import org.apache.hudi.common.table.HoodieTableMetaClient; @@ -116,7 +116,7 @@ public static FlinkCompactionConfig getFlinkCompactionConfig(String[] args) { /** * Schedules compaction in service. */ - public static class AsyncCompactionService extends HoodieAsyncService { + public static class AsyncCompactionService extends HoodieAsyncTableService { private static final long serialVersionUID = 1L; /** @@ -173,6 +173,7 @@ public AsyncCompactionService(FlinkCompactionConfig cfg, Configuration conf, Str CompactionUtil.inferChangelogMode(conf, metaClient); this.writeClient = StreamerUtil.createWriteClient(conf); + this.writeConfig = writeClient.getConfig(); this.table = writeClient.getHoodieTable(); } diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketAssignFunction.java b/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketAssignFunction.java index 73fd6685539fb..d01db962c9ba4 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketAssignFunction.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketAssignFunction.java @@ -22,6 +22,7 @@ import org.apache.hudi.client.common.HoodieFlinkEngineContext; import org.apache.hudi.common.config.SerializableConfiguration; import org.apache.hudi.common.model.BaseAvroPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordGlobalLocation; @@ -180,7 +181,7 @@ private void processRecord(HoodieRecord record, Collector out) throws Exce if (globalIndex) { // if partition path changes, emit a delete record for old partition path, // then update the index state using location with new partition path. - HoodieRecord deleteRecord = new HoodieRecord<>(new HoodieKey(recordKey, oldLoc.getPartitionPath()), + HoodieRecord deleteRecord = new HoodieAvroRecord<>(new HoodieKey(recordKey, oldLoc.getPartitionPath()), payloadCreation.createDeletePayload((BaseAvroPayload) record.getData())); deleteRecord.setCurrentLocation(oldLoc.toLocal("U")); deleteRecord.seal(); diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketIndexPartitioner.java b/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketIndexPartitioner.java new file mode 100644 index 0000000000000..ab46b0317e477 --- /dev/null +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketIndexPartitioner.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.sink.partitioner; + +import org.apache.flink.api.common.functions.Partitioner; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.index.bucket.BucketIdentifier; + +/** + * Bucket index input partitioner. + * The fields to hash can be a subset of the primary key fields. + * + * @param The type of obj to hash + */ +public class BucketIndexPartitioner implements Partitioner { + + private final int bucketNum; + private final String indexKeyFields; + + public BucketIndexPartitioner(int bucketNum, String indexKeyFields) { + this.bucketNum = bucketNum; + this.indexKeyFields = indexKeyFields; + } + + @Override + public int partition(HoodieKey key, int numPartitions) { + int curBucket = BucketIdentifier.getBucketId(key, indexKeyFields, bucketNum); + return BucketIdentifier.mod(curBucket, numPartitions); + } +} diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/DeltaWriteProfile.java b/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/DeltaWriteProfile.java index 97b6b238814c7..aad775a356423 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/DeltaWriteProfile.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/DeltaWriteProfile.java @@ -59,7 +59,7 @@ protected List smallFilesProfile(String partitionPath) { List allSmallFileSlices = new ArrayList<>(); // If we can index log files, we can add more inserts to log files for fileIds including those under // pending compaction. 
- List allFileSlices = fsView.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), true) + List allFileSlices = fsView.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), false) .collect(Collectors.toList()); for (FileSlice fileSlice : allFileSlices) { if (isSmallFile(fileSlice)) { diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/WriteProfile.java b/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/WriteProfile.java index 84fcd03f0833b..fdb8152b00577 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/WriteProfile.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/WriteProfile.java @@ -180,7 +180,14 @@ public synchronized List getSmallFiles(String partitionPath) { if (smallFilesMap.containsKey(partitionPath)) { return smallFilesMap.get(partitionPath); } - List smallFiles = smallFilesProfile(partitionPath); + + List smallFiles = new ArrayList<>(); + if (config.getParquetSmallFileLimit() <= 0) { + this.smallFilesMap.put(partitionPath, smallFiles); + return smallFiles; + } + + smallFiles = smallFilesProfile(partitionPath); this.smallFilesMap.put(partitionPath, smallFiles); return smallFiles; } diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/WriteProfiles.java b/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/WriteProfiles.java index e8aafd830f10f..405522802c368 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/WriteProfiles.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/profile/WriteProfiles.java @@ -18,6 +18,10 @@ package org.apache.hudi.sink.partitioner.profile; +import org.apache.flink.core.fs.Path; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; import org.apache.hudi.client.common.HoodieFlinkEngineContext; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieCommitMetadata; @@ -29,11 +33,6 @@ import org.apache.hudi.exception.HoodieException; import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; import org.apache.hudi.util.StreamerUtil; - -import org.apache.flink.core.fs.Path; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -117,7 +116,7 @@ private static Map getFilesToReadOfInstant( HoodieCommitMetadata metadata, FileSystem fs, HoodieTableType tableType) { - return getFilesToRead(metadata, basePath.toString(), tableType).entrySet().stream() + return getFilesToRead(fs.getConf(), metadata, basePath.toString(), tableType).entrySet().stream() // filter out the file paths that does not exist, some files may be cleaned by // the cleaner. 
.filter(entry -> { @@ -133,14 +132,16 @@ private static Map getFilesToReadOfInstant( } private static Map getFilesToRead( + Configuration hadoopConf, HoodieCommitMetadata metadata, String basePath, - HoodieTableType tableType) { + HoodieTableType tableType + ) { switch (tableType) { case COPY_ON_WRITE: - return metadata.getFileIdToFileStatus(basePath); + return metadata.getFileIdToFileStatus(hadoopConf, basePath); case MERGE_ON_READ: - return metadata.getFullPathToFileStatus(basePath); + return metadata.getFullPathToFileStatus(hadoopConf, basePath); default: throw new AssertionError(); } diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/transform/RowDataToHoodieFunction.java b/hudi-flink/src/main/java/org/apache/hudi/sink/transform/RowDataToHoodieFunction.java index b600a5d2f50e4..bfc7d7d62ad45 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/transform/RowDataToHoodieFunction.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/transform/RowDataToHoodieFunction.java @@ -18,11 +18,11 @@ package org.apache.hudi.sink.transform; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieOperation; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.keygen.KeyGenerator; import org.apache.hudi.keygen.factory.HoodieAvroKeyGeneratorFactory; import org.apache.hudi.sink.utils.PayloadCreation; @@ -87,7 +87,7 @@ public void open(Configuration parameters) throws Exception { this.converter = RowDataToAvroConverters.createConverter(this.rowType); this.keyGenerator = HoodieAvroKeyGeneratorFactory - .createKeyGenerator(flinkConf2TypedProperties(FlinkOptions.flatOptions(this.config))); + .createKeyGenerator(flinkConf2TypedProperties(this.config)); this.payloadCreation = PayloadCreation.instance(config); } @@ -111,6 +111,6 @@ private HoodieRecord toHoodieRecord(I record) throws Exception { HoodieRecordPayload payload = payloadCreation.createPayload(gr); HoodieOperation operation = HoodieOperation.fromValue(record.getRowKind().toByteValue()); - return new HoodieRecord<>(hoodieKey, payload, operation); + return new HoodieAvroRecord<>(hoodieKey, payload, operation); } } diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/transform/RowDataToHoodieFunctionWithRateLimit.java b/hudi-flink/src/main/java/org/apache/hudi/sink/transform/RowDataToHoodieFunctionWithRateLimit.java index 4526c6ff9ea98..3d42ad87d908e 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/transform/RowDataToHoodieFunctionWithRateLimit.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/transform/RowDataToHoodieFunctionWithRateLimit.java @@ -22,7 +22,7 @@ import org.apache.hudi.configuration.FlinkOptions; import org.apache.flink.configuration.Configuration; -import org.apache.flink.shaded.guava18.com.google.common.util.concurrent.RateLimiter; +import org.apache.flink.shaded.guava30.com.google.common.util.concurrent.RateLimiter; import org.apache.flink.table.data.RowData; import org.apache.flink.table.types.logical.RowType; diff --git a/hudi-flink/src/main/java/org/apache/hudi/sink/utils/Pipelines.java b/hudi-flink/src/main/java/org/apache/hudi/sink/utils/Pipelines.java index 5f156e839f1e3..ae8b4f21300a2 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/sink/utils/Pipelines.java +++ b/hudi-flink/src/main/java/org/apache/hudi/sink/utils/Pipelines.java @@ -18,9 +18,11 @@ package 
org.apache.hudi.sink.utils; +import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.configuration.OptionsResolver; +import org.apache.hudi.sink.BucketStreamWriteOperator; import org.apache.hudi.sink.CleanFunction; import org.apache.hudi.sink.StreamWriteOperator; import org.apache.hudi.sink.append.AppendWriteOperator; @@ -36,14 +38,18 @@ import org.apache.hudi.sink.compact.CompactionPlanEvent; import org.apache.hudi.sink.compact.CompactionPlanOperator; import org.apache.hudi.sink.partitioner.BucketAssignFunction; +import org.apache.hudi.sink.partitioner.BucketIndexPartitioner; import org.apache.hudi.sink.transform.RowDataToHoodieFunctions; import org.apache.hudi.table.format.FilePathUtils; +import org.apache.flink.api.common.functions.Partitioner; import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.configuration.Configuration; +import org.apache.flink.runtime.state.KeyGroupRangeAssignment; import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.datastream.DataStreamSink; import org.apache.flink.streaming.api.functions.sink.SinkFunction; +import org.apache.flink.streaming.api.graph.StreamGraphGenerator; import org.apache.flink.streaming.api.operators.KeyedProcessOperator; import org.apache.flink.streaming.api.operators.ProcessOperator; import org.apache.flink.table.data.RowData; @@ -55,6 +61,31 @@ */ public class Pipelines { + /** + * Bulk insert the input dataset at once. + * + *

By default, the input dataset is shuffled by the partition path first and then + * sorted by the partition path before being passed to the write function. + * The whole pipeline looks like the following: + * + *

+   *      | input1 | ===\     /=== |sorter| === | task1 | (p1, p2)
+   *                   shuffle
+   *      | input2 | ===/     \=== |sorter| === | task2 | (p3, p4)
+   *
+   *      Note: Both input1's and input2's datasets come from partitions: p1, p2, p3, p4
+   * 
+ * + *

The write task switches to a new file handle each time it receives a record + * from a different partition path; the shuffle and sort reduce the number of small files. + * + *
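The shuffle itself is just a custom channel selector; a sketch in isolation, mirroring the partitionCustom change in the hunk below, where it is wired as dataStream.partitionCustom(byPartitionPath(), rowDataKeyGen::getPartitionPath):

    import org.apache.flink.api.common.functions.Partitioner;
    import org.apache.flink.runtime.state.KeyGroupRangeAssignment;
    import org.apache.flink.streaming.api.graph.StreamGraphGenerator;

    public class PartitionPathShuffleSketch {
      // routes a partition path to the same channel a keyBy would pick, but
      // without triggering the batch runtime's implicit keyBy-induced sort
      public static Partitioner<String> byPartitionPath() {
        return (key, channels) ->
            KeyGroupRangeAssignment.assignKeyToParallelOperator(
                key, StreamGraphGenerator.DEFAULT_LOWER_BOUND_MAX_PARALLELISM, channels);
      }
    }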

The bulk insert should be run in batch execution mode. + * + * @param conf The configuration + * @param rowType The input row type + * @param dataStream The input data stream + * @return the bulk insert data stream sink + */ public static DataStreamSink bulkInsert(Configuration conf, RowType rowType, DataStream dataStream) { WriteOperatorFactory operatorFactory = BulkInsertWriteOperator.getFactory(conf, rowType); @@ -64,7 +95,11 @@ public static DataStreamSink bulkInsert(Configuration conf, RowType rowT if (conf.getBoolean(FlinkOptions.WRITE_BULK_INSERT_SHUFFLE_BY_PARTITION)) { // shuffle by partition keys - dataStream = dataStream.keyBy(rowDataKeyGen::getPartitionPath); + // use #partitionCustom instead of #keyBy to avoid duplicate sort operations, + // see BatchExecutionUtils#applyBatchExecutionSettings for details. + Partitioner partitioner = (key, channels) -> + KeyGroupRangeAssignment.assignKeyToParallelOperator(key, StreamGraphGenerator.DEFAULT_LOWER_BOUND_MAX_PARALLELISM, channels); + dataStream = dataStream.partitionCustom(partitioner, rowDataKeyGen::getPartitionPath); } if (conf.getBoolean(FlinkOptions.WRITE_BULK_INSERT_SORT_BY_PARTITION)) { SortOperatorGen sortOperatorGen = new SortOperatorGen(rowType, partitionFields); @@ -88,6 +123,27 @@ public static DataStreamSink bulkInsert(Configuration conf, RowType rowT .name("dummy"); } + /** + * Insert the dataset in append mode (no upsert or deduplication). + * + *

The input dataset would be rebalanced among the write tasks: + * + *

+   *      | input1 | ===\     /=== | task1 | (p1, p2, p3, p4)
+   *                   shuffle
+   *      | input2 | ===/     \=== | task2 | (p1, p2, p3, p4)
+   *
+   *      Note: Both input1's and input2's datasets come from partitions: p1, p2, p3, p4
+   * 
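This append path is what a plain INSERT job gets; a hedged sketch of requesting it, using Hudi's pre-existing FlinkOptions.OPERATION option rather than anything added in this patch (the exact routing between the append and upsert pipelines lives in the sink builder, which this patch does not show):

    import org.apache.flink.configuration.Configuration;
    import org.apache.hudi.common.model.WriteOperationType;
    import org.apache.hudi.configuration.FlinkOptions;

    public class AppendModeSketch {
      public static Configuration appendConf() {
        Configuration conf = new Configuration();
        // "insert" disables upsert and deduplication, steering the job toward append
        conf.setString(FlinkOptions.OPERATION, WriteOperationType.INSERT.value());
        return conf;
      }
    }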
+ * + *

The write task switches to a new file handle each time it receives a record + * from a different partition path, so there may be many small files. + * + * @param conf The configuration + * @param rowType The input row type + * @param dataStream The input data stream + * @return the appending data stream sink + */ public static DataStreamSink append(Configuration conf, RowType rowType, DataStream dataStream) { WriteOperatorFactory operatorFactory = AppendWriteOperator.getFactory(conf, rowType); @@ -101,6 +157,8 @@ public static DataStreamSink append(Configuration conf, RowType rowType, /** * Constructs bootstrap pipeline as streaming. + * The bootstrap operator loads the existing data index (primary key to file id mapping), + * then sends the indexing data set to the subsequent operator (usually the bucket assign operator). + */ public static DataStream bootstrap( Configuration conf, @@ -112,6 +170,8 @@ public static DataStream bootstrap( /** * Constructs bootstrap pipeline. + * The bootstrap operator loads the existing data index (primary key to file id mapping), + * then sends the indexing data set to the subsequent operator (usually the bucket assign operator). * * @param conf The configuration * @param rowType The row type @@ -128,7 +188,7 @@ public static DataStream bootstrap( boolean bounded, boolean overwrite) { final boolean globalIndex = conf.getBoolean(FlinkOptions.INDEX_GLOBAL_ENABLED); - if (overwrite) { + if (overwrite || OptionsResolver.isBucketIndexType(conf)) { return rowDataToHoodieRecord(conf, rowType, dataStream); } else if (bounded && !globalIndex && OptionsResolver.isPartitionedTable(conf)) { return boundedBootstrap(conf, rowType, defaultParallelism, dataStream); @@ -158,6 +218,11 @@ private static DataStream streamBootstrap( return dataStream1; } + /** + * Constructs bootstrap pipeline for batch execution mode. + * The indexing data set is loaded before the actual data write + * in order to support batch UPSERT. + */ private static DataStream boundedBootstrap( Configuration conf, RowType rowType, @@ -177,28 +242,85 @@ private static DataStream boundedBootstrap( .uid("uid_batch_index_bootstrap_" + conf.getString(FlinkOptions.TABLE_NAME)); } + /** + * Transforms the row data to hoodie records. + */ public static DataStream rowDataToHoodieRecord(Configuration conf, RowType rowType, DataStream dataStream) { return dataStream.map(RowDataToHoodieFunctions.create(rowType, conf), TypeInformation.of(HoodieRecord.class)); } + /** + * The streaming write pipeline. + * + *

The input dataset is shuffled by the primary key first and then + * shuffled by the file group ID before being passed to the write function. + * The whole pipeline looks like the following: + * + *

+   *      | input1 | ===\     /=== | bucket assigner | ===\     /=== | task1 |
+   *                   shuffle(by PK)                    shuffle(by bucket ID)
+   *      | input2 | ===/     \=== | bucket assigner | ===/     \=== | task2 |
+   *
+   *      Note: a file group must be handled by one write task to avoid write conflicts.
+   * 
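With the bucket index enabled, hoodieStreamWrite below replaces both keyBy stages with a single partitionCustom over BucketIndexPartitioner; routing then becomes a deterministic function of the hashed key, which is what pins a file group to one task. A small demo of that property (the generic parameter, stripped in this rendering, is assumed to be HoodieKey):

    import org.apache.hudi.common.model.HoodieKey;
    import org.apache.hudi.sink.partitioner.BucketIndexPartitioner;

    public class BucketPinningDemo {
      public static void main(String[] args) {
        BucketIndexPartitioner<HoodieKey> partitioner = new BucketIndexPartitioner<>(4, "uuid");
        HoodieKey key = new HoodieKey("uuid-0001", "par1");
        // prints the same channel twice: every record of a bucket takes the same route
        System.out.println(partitioner.partition(key, 2));
        System.out.println(partitioner.partition(key, 2));
      }
    }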
+ * + *

The bucket assigner assigns the inputs to suitable file groups; the write task caches + * and flushes the data set to disk. + * + * @param conf The configuration + * @param defaultParallelism The default parallelism + * @param dataStream The input data stream + * @return the stream write data stream pipeline + */ public static DataStream hoodieStreamWrite(Configuration conf, int defaultParallelism, DataStream dataStream) { - WriteOperatorFactory operatorFactory = StreamWriteOperator.getFactory(conf); - return dataStream + if (OptionsResolver.isBucketIndexType(conf)) { + WriteOperatorFactory operatorFactory = BucketStreamWriteOperator.getFactory(conf); + int bucketNum = conf.getInteger(FlinkOptions.BUCKET_INDEX_NUM_BUCKETS); + String indexKeyFields = conf.getString(FlinkOptions.INDEX_KEY_FIELD); + BucketIndexPartitioner partitioner = new BucketIndexPartitioner<>(bucketNum, indexKeyFields); + return dataStream.partitionCustom(partitioner, HoodieRecord::getKey) + .transform("bucket_write", TypeInformation.of(Object.class), operatorFactory) + .uid("uid_bucket_write" + conf.getString(FlinkOptions.TABLE_NAME)) + .setParallelism(conf.getInteger(FlinkOptions.WRITE_TASKS)); + } else { + WriteOperatorFactory operatorFactory = StreamWriteOperator.getFactory(conf); + return dataStream // Key-by record key, to avoid multiple subtasks write to a bucket at the same time .keyBy(HoodieRecord::getRecordKey) .transform( - "bucket_assigner", - TypeInformation.of(HoodieRecord.class), - new KeyedProcessOperator<>(new BucketAssignFunction<>(conf))) + "bucket_assigner", + TypeInformation.of(HoodieRecord.class), + new KeyedProcessOperator<>(new BucketAssignFunction<>(conf))) .uid("uid_bucket_assigner_" + conf.getString(FlinkOptions.TABLE_NAME)) .setParallelism(conf.getOptional(FlinkOptions.BUCKET_ASSIGN_TASKS).orElse(defaultParallelism)) // shuffle by fileId(bucket id) .keyBy(record -> record.getCurrentLocation().getFileId()) - .transform("hoodie_stream_write", TypeInformation.of(Object.class), operatorFactory) - .uid("uid_hoodie_stream_write" + conf.getString(FlinkOptions.TABLE_NAME)) + .transform("stream_write", TypeInformation.of(Object.class), operatorFactory) + .uid("uid_stream_write" + conf.getString(FlinkOptions.TABLE_NAME)) .setParallelism(conf.getInteger(FlinkOptions.WRITE_TASKS)); + } } + /** + * The compaction tasks pipeline. + * + *

The compaction plan operator monitors the timeline for new compaction plans, + * then distributes the sub-plans to the compaction tasks. Each compaction task then + * hands the metadata over to the commit task, which commits the compaction transaction. + * The whole pipeline looks like the following: + * + *

+   *                                           /=== | task1 | ===\
+   *      | plan generation | ===> re-balance                      | commit |
+   *                                           \=== | task2 | ===/
+   *
+   *      Note: both the compaction plan generation task and the commit task are singletons.
+   * 
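The commit box above maps onto the reworked client API used throughout this patch: compact() now returns a HoodieWriteMetadata and commitCompaction() takes the extracted commit metadata, matching the HoodieWriteClientExample and CompactionCommitSink changes earlier. A minimal sketch, assuming the Spark client type from that example:

    import org.apache.hudi.client.SparkRDDWriteClient;
    import org.apache.hudi.client.WriteStatus;
    import org.apache.hudi.common.util.Option;
    import org.apache.hudi.table.action.HoodieWriteMetadata;
    import org.apache.spark.api.java.JavaRDD;

    public class CompactOnceSketch {
      public static void compactOnce(SparkRDDWriteClient<?> client) {
        // schedule an inline compaction, then run and commit it in one pass
        Option<String> instant = client.scheduleCompaction(Option.empty());
        if (instant.isPresent()) {
          HoodieWriteMetadata<JavaRDD<WriteStatus>> result = client.compact(instant.get());
          client.commitCompaction(instant.get(), result.getCommitMetadata().get(), Option.empty());
        }
      }
    }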
+ * + * @param conf The configuration + * @param dataStream The input data stream + * @return the compaction pipeline + */ public static DataStreamSink compact(Configuration conf, DataStream dataStream) { return dataStream.transform("compact_plan_generate", TypeInformation.of(CompactionPlanEvent.class), diff --git a/hudi-flink/src/main/java/org/apache/hudi/source/IncrementalInputSplits.java b/hudi-flink/src/main/java/org/apache/hudi/source/IncrementalInputSplits.java index 58c38ef56744e..02e0e253cf577 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/source/IncrementalInputSplits.java +++ b/hudi-flink/src/main/java/org/apache/hudi/source/IncrementalInputSplits.java @@ -250,22 +250,12 @@ private List getArchivedMetadata( InstantRange instantRange, HoodieTimeline commitTimeline, String tableName) { - if (instantRange == null || commitTimeline.isBeforeTimelineStarts(instantRange.getStartInstant())) { - // read the archived metadata if: - // 1. the start commit is 'earliest'; - // 2. the start instant is archived. - HoodieArchivedTimeline archivedTimeline = metaClient.getArchivedTimeline(); + if (commitTimeline.isBeforeTimelineStarts(instantRange.getStartInstant())) { + // read the archived metadata if the start instant is archived. + HoodieArchivedTimeline archivedTimeline = metaClient.getArchivedTimeline(instantRange.getStartInstant()); HoodieTimeline archivedCompleteTimeline = archivedTimeline.getCommitsTimeline().filterCompletedInstants(); if (!archivedCompleteTimeline.empty()) { - final String endTs = archivedCompleteTimeline.lastInstant().get().getTimestamp(); Stream instantStream = archivedCompleteTimeline.getInstants(); - if (instantRange != null) { - archivedTimeline.loadInstantDetailsInMemory(instantRange.getStartInstant(), endTs); - instantStream = instantStream.filter(s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), GREATER_THAN_OR_EQUALS, instantRange.getStartInstant())); - } else { - final String startTs = archivedCompleteTimeline.firstInstant().get().getTimestamp(); - archivedTimeline.loadInstantDetailsInMemory(startTs, endTs); - } return maySkipCompaction(instantStream) .map(instant -> WriteProfiles.getCommitMetadata(tableName, path, instant, archivedTimeline)).collect(Collectors.toList()); } diff --git a/hudi-flink/src/main/java/org/apache/hudi/source/StreamReadOperator.java b/hudi-flink/src/main/java/org/apache/hudi/source/StreamReadOperator.java index 013043384d3b5..c3f43422f1d1e 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/source/StreamReadOperator.java +++ b/hudi-flink/src/main/java/org/apache/hudi/source/StreamReadOperator.java @@ -21,6 +21,7 @@ import org.apache.hudi.table.format.mor.MergeOnReadInputFormat; import org.apache.hudi.table.format.mor.MergeOnReadInputSplit; +import org.apache.flink.api.common.operators.MailboxExecutor; import org.apache.flink.api.common.state.ListState; import org.apache.flink.api.common.state.ListStateDescriptor; import org.apache.flink.runtime.state.JavaSerializer; @@ -29,7 +30,6 @@ import org.apache.flink.streaming.api.functions.source.SourceFunction; import org.apache.flink.streaming.api.operators.AbstractStreamOperator; import org.apache.flink.streaming.api.operators.AbstractStreamOperatorFactory; -import org.apache.flink.streaming.api.operators.MailboxExecutor; import org.apache.flink.streaming.api.operators.OneInputStreamOperator; import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; import org.apache.flink.streaming.api.operators.StreamOperator; @@ -54,7 +54,7 @@ * 
StreamReadMonitoringFunction}. Contrary to the {@link StreamReadMonitoringFunction} which has a parallelism of 1,
 * this operator can run with a parallelism greater than 1.
 *
- * <p>As soon as an input split {@link MergeOnReadInputSplit} is received, it is put in a queue,
+ * <p>As soon as an input split {@link MergeOnReadInputSplit} is received, it is put into a queue, * the {@link MailboxExecutor} reads the actual data of the split. * This architecture allows the separation of split reading from processing the checkpoint barriers, * thus removing any potential back-pressure. @@ -64,7 +64,7 @@ public class StreamReadOperator extends AbstractStreamOperator private static final Logger LOG = LoggerFactory.getLogger(StreamReadOperator.class); - private static final int MINI_BATCH_SIZE = 1000; + private static final int MINI_BATCH_SIZE = 2048; // It's the same thread that runs this operator and checkpoint actions. Use this executor to schedule only // splits for subsequent reading, so that a new checkpoint could be triggered without blocking for a long time @@ -118,10 +118,10 @@ public void initializeState(StateInitializationContext context) throws Exception getOperatorConfig().getTimeCharacteristic(), getProcessingTimeService(), new Object(), // no actual locking needed - getContainingTask().getStreamStatusMaintainer(), output, getRuntimeContext().getExecutionConfig().getAutoWatermarkInterval(), - -1); + -1, + true); // Enqueue to process the recovered input splits. enqueueProcessSplits(); @@ -205,8 +205,8 @@ public void processWatermark(Watermark mark) { } @Override - public void dispose() throws Exception { - super.dispose(); + public void close() throws Exception { + super.close(); if (format != null) { format.close(); @@ -218,8 +218,8 @@ public void dispose() throws Exception { } @Override - public void close() throws Exception { - super.close(); + public void finish() throws Exception { + super.finish(); output.close(); if (sourceContext != null) { sourceContext.emitWatermark(Watermark.MAX_WATERMARK); diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java b/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java index 5299551fccd38..7543382e19df4 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java +++ b/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableFactory.java @@ -24,9 +24,11 @@ import org.apache.hudi.configuration.OptionsResolver; import org.apache.hudi.exception.HoodieValidationException; import org.apache.hudi.hive.MultiPartKeysValueExtractor; +import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.keygen.ComplexAvroKeyGenerator; import org.apache.hudi.keygen.NonpartitionedAvroKeyGenerator; import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.apache.hudi.util.AvroSchemaConverter; import org.apache.hudi.util.DataTypeUtils; @@ -198,6 +200,11 @@ private static void setupHoodieKeyOptions(Configuration conf, CatalogTable table // the PARTITIONED BY syntax always has higher priority than option FlinkOptions#PARTITION_PATH_FIELD conf.setString(FlinkOptions.PARTITION_PATH_FIELD, String.join(",", partitionKeys)); } + // set index key for bucket index if not defined + if (conf.getString(FlinkOptions.INDEX_TYPE).equals(HoodieIndex.IndexType.BUCKET.name()) + && conf.getString(FlinkOptions.INDEX_KEY_FIELD).isEmpty()) { + conf.setString(FlinkOptions.INDEX_KEY_FIELD, conf.getString(FlinkOptions.RECORD_KEY_FIELD)); + } // tweak the key gen class if possible final String[] partitions = conf.getString(FlinkOptions.PARTITION_PATH_FIELD).split(","); final String[] pks = conf.getString(FlinkOptions.RECORD_KEY_FIELD).split(","); @@ -238,22 +245,22 @@ public static void 
setupTimestampKeygenOptions(Configuration conf, DataType fiel int precision = DataTypeUtils.precision(fieldType.getLogicalType()); if (precision == 0) { // seconds - conf.setString(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_TYPE_FIELD_PROP, + conf.setString(KeyGeneratorOptions.Config.TIMESTAMP_TYPE_FIELD_PROP, TimestampBasedAvroKeyGenerator.TimestampType.UNIX_TIMESTAMP.name()); } else if (precision == 3) { // milliseconds - conf.setString(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_TYPE_FIELD_PROP, + conf.setString(KeyGeneratorOptions.Config.TIMESTAMP_TYPE_FIELD_PROP, TimestampBasedAvroKeyGenerator.TimestampType.EPOCHMILLISECONDS.name()); } String partitionFormat = conf.getOptional(FlinkOptions.PARTITION_FORMAT).orElse(FlinkOptions.PARTITION_FORMAT_HOUR); - conf.setString(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, partitionFormat); + conf.setString(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, partitionFormat); } else { - conf.setString(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_TYPE_FIELD_PROP, + conf.setString(KeyGeneratorOptions.Config.TIMESTAMP_TYPE_FIELD_PROP, TimestampBasedAvroKeyGenerator.TimestampType.DATE_STRING.name()); String partitionFormat = conf.getOptional(FlinkOptions.PARTITION_FORMAT).orElse(FlinkOptions.PARTITION_FORMAT_DAY); - conf.setString(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, partitionFormat); + conf.setString(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, partitionFormat); } - conf.setString(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_OUTPUT_TIMEZONE_FORMAT_PROP, "UTC"); + conf.setString(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_TIMEZONE_FORMAT_PROP, "UTC"); } /** diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSink.java b/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSink.java index c1e6d0c28aa06..bbbc67985c8af 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSink.java +++ b/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSink.java @@ -84,7 +84,6 @@ public SinkRuntimeProvider getSinkRuntimeProvider(Context context) { // default parallelism int parallelism = dataStream.getExecutionConfig().getParallelism(); DataStream pipeline; - // bootstrap final DataStream hoodieRecordDataStream = Pipelines.bootstrap(conf, rowType, parallelism, dataStream, context.isBounded(), overwrite); diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSource.java b/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSource.java index 259c2e40cd477..3efd1d5612f15 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSource.java +++ b/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSource.java @@ -452,8 +452,8 @@ private Schema inferSchemaFromDdl() { @VisibleForTesting public Schema getTableAvroSchema() { try { - TableSchemaResolver schemaUtil = new TableSchemaResolver(metaClient, conf.getBoolean(FlinkOptions.CHANGELOG_ENABLED)); - return schemaUtil.getTableAvroSchema(); + TableSchemaResolver schemaResolver = new TableSchemaResolver(metaClient); + return schemaResolver.getTableAvroSchema(); } catch (Throwable e) { // table exists but has no written data LOG.warn("Get table avro schema error, use schema from the DDL instead", e); diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieCatalog.java b/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieCatalog.java index fddaaba66e291..3317967006101 100644 --- 
a/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieCatalog.java +++ b/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieCatalog.java @@ -376,7 +376,7 @@ public List listPartitionsByFilter(ObjectPath tablePath, L @Override public CatalogPartition getPartition(ObjectPath tablePath, CatalogPartitionSpec catalogPartitionSpec) throws PartitionNotExistException, CatalogException { - return null; + throw new PartitionNotExistException(getName(), tablePath, catalogPartitionSpec); } @Override @@ -409,7 +409,7 @@ public List listFunctions(String databaseName) throws DatabaseNotExistEx @Override public CatalogFunction getFunction(ObjectPath functionPath) throws FunctionNotExistException, CatalogException { - return null; + throw new FunctionNotExistException(getName(), functionPath); } @Override diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/format/FormatUtils.java b/hudi-flink/src/main/java/org/apache/hudi/table/format/FormatUtils.java index 2c3318362b053..fa404cc2163ec 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/table/format/FormatUtils.java +++ b/hudi-flink/src/main/java/org/apache/hudi/table/format/FormatUtils.java @@ -24,6 +24,7 @@ import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; import org.apache.hudi.common.table.log.HoodieUnMergedLogRecordScanner; import org.apache.hudi.common.util.DefaultSizeEstimator; +import org.apache.hudi.common.util.Functions; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor; import org.apache.hudi.common.util.queue.BoundedInMemoryQueueProducer; @@ -50,6 +51,7 @@ import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.function.Function; /** * Utilities for format. @@ -193,8 +195,9 @@ public BoundedMemoryRecords( HoodieRealtimeRecordReaderUtils.getMaxCompactionMemoryInBytes(new JobConf(hadoopConf)), getParallelProducers(), Option.empty(), - x -> x, - new DefaultSizeEstimator<>()); + Function.identity(), + new DefaultSizeEstimator<>(), + Functions.noop()); // Consumer of this record reader this.iterator = this.executor.getQueue().iterator(); this.scanner = FormatUtils.unMergedLogScanner(split, logSchema, hadoopConf, diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/ParquetColumnarRowSplitReader.java b/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/ParquetColumnarRowSplitReader.java index c615283c7c5ad..3cb491cfaf575 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/ParquetColumnarRowSplitReader.java +++ b/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/ParquetColumnarRowSplitReader.java @@ -18,11 +18,10 @@ package org.apache.hudi.table.format.cow; -import org.apache.hudi.table.format.cow.data.ColumnarRowData; -import org.apache.hudi.table.format.cow.vector.VectorizedColumnBatch; - import org.apache.flink.formats.parquet.vector.reader.ColumnReader; +import org.apache.flink.table.data.ColumnarRowData; import org.apache.flink.table.data.vector.ColumnVector; +import org.apache.flink.table.data.vector.VectorizedColumnBatch; import org.apache.flink.table.data.vector.writable.WritableColumnVector; import org.apache.flink.table.types.logical.LogicalType; import org.apache.flink.table.types.logical.LogicalTypeRoot; diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/ParquetSplitReaderUtil.java b/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/ParquetSplitReaderUtil.java index 12d63aa974a5d..ca1408dcb7a5c 100644 --- 
a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/ParquetSplitReaderUtil.java +++ b/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/ParquetSplitReaderUtil.java @@ -22,7 +22,6 @@ import org.apache.hudi.table.format.cow.vector.HeapArrayVector; import org.apache.hudi.table.format.cow.vector.HeapMapColumnVector; import org.apache.hudi.table.format.cow.vector.HeapRowColumnVector; -import org.apache.hudi.table.format.cow.vector.VectorizedColumnBatch; import org.apache.hudi.table.format.cow.vector.reader.ArrayColumnReader; import org.apache.hudi.table.format.cow.vector.reader.MapColumnReader; import org.apache.hudi.table.format.cow.vector.reader.RowColumnReader; @@ -41,6 +40,7 @@ import org.apache.flink.table.data.DecimalData; import org.apache.flink.table.data.TimestampData; import org.apache.flink.table.data.vector.ColumnVector; +import org.apache.flink.table.data.vector.VectorizedColumnBatch; import org.apache.flink.table.data.vector.heap.HeapBooleanVector; import org.apache.flink.table.data.vector.heap.HeapByteVector; import org.apache.flink.table.data.vector.heap.HeapBytesVector; diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/data/ColumnarArrayData.java b/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/data/ColumnarArrayData.java deleted file mode 100644 index a16a4dd8d0142..0000000000000 --- a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/data/ColumnarArrayData.java +++ /dev/null @@ -1,272 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.table.format.cow.data; - -import org.apache.hudi.table.format.cow.vector.MapColumnVector; - -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RawValueData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.data.binary.TypedSetters; -import org.apache.flink.table.data.vector.ArrayColumnVector; -import org.apache.flink.table.data.vector.BooleanColumnVector; -import org.apache.flink.table.data.vector.ByteColumnVector; -import org.apache.flink.table.data.vector.BytesColumnVector; -import org.apache.flink.table.data.vector.ColumnVector; -import org.apache.flink.table.data.vector.DecimalColumnVector; -import org.apache.flink.table.data.vector.DoubleColumnVector; -import org.apache.flink.table.data.vector.FloatColumnVector; -import org.apache.flink.table.data.vector.IntColumnVector; -import org.apache.flink.table.data.vector.LongColumnVector; -import org.apache.flink.table.data.vector.RowColumnVector; -import org.apache.flink.table.data.vector.ShortColumnVector; -import org.apache.flink.table.data.vector.TimestampColumnVector; - -import java.util.Arrays; - -/** - * Columnar array to support access to vector column data. - * - *

References {@code org.apache.flink.table.data.ColumnarArrayData} to include FLINK-15390. - */ -public final class ColumnarArrayData implements ArrayData, TypedSetters { - - private final ColumnVector data; - private final int offset; - private final int numElements; - - public ColumnarArrayData(ColumnVector data, int offset, int numElements) { - this.data = data; - this.offset = offset; - this.numElements = numElements; - } - - @Override - public int size() { - return numElements; - } - - @Override - public boolean isNullAt(int pos) { - return data.isNullAt(offset + pos); - } - - @Override - public void setNullAt(int pos) { - throw new UnsupportedOperationException("Not support the operation!"); - } - - @Override - public boolean getBoolean(int pos) { - return ((BooleanColumnVector) data).getBoolean(offset + pos); - } - - @Override - public byte getByte(int pos) { - return ((ByteColumnVector) data).getByte(offset + pos); - } - - @Override - public short getShort(int pos) { - return ((ShortColumnVector) data).getShort(offset + pos); - } - - @Override - public int getInt(int pos) { - return ((IntColumnVector) data).getInt(offset + pos); - } - - @Override - public long getLong(int pos) { - return ((LongColumnVector) data).getLong(offset + pos); - } - - @Override - public float getFloat(int pos) { - return ((FloatColumnVector) data).getFloat(offset + pos); - } - - @Override - public double getDouble(int pos) { - return ((DoubleColumnVector) data).getDouble(offset + pos); - } - - @Override - public StringData getString(int pos) { - BytesColumnVector.Bytes byteArray = getByteArray(pos); - return StringData.fromBytes(byteArray.data, byteArray.offset, byteArray.len); - } - - @Override - public DecimalData getDecimal(int pos, int precision, int scale) { - return ((DecimalColumnVector) data).getDecimal(offset + pos, precision, scale); - } - - @Override - public TimestampData getTimestamp(int pos, int precision) { - return ((TimestampColumnVector) data).getTimestamp(offset + pos, precision); - } - - @Override - public RawValueData getRawValue(int pos) { - throw new UnsupportedOperationException("RawValueData is not supported."); - } - - @Override - public byte[] getBinary(int pos) { - BytesColumnVector.Bytes byteArray = getByteArray(pos); - if (byteArray.len == byteArray.data.length) { - return byteArray.data; - } else { - return Arrays.copyOfRange(byteArray.data, byteArray.offset, byteArray.len); - } - } - - @Override - public ArrayData getArray(int pos) { - return ((ArrayColumnVector) data).getArray(offset + pos); - } - - @Override - public MapData getMap(int pos) { - return ((MapColumnVector) data).getMap(offset + pos); - } - - @Override - public RowData getRow(int pos, int numFields) { - return ((RowColumnVector) data).getRow(offset + pos); - } - - @Override - public void setBoolean(int pos, boolean value) { - throw new UnsupportedOperationException("Not support the operation!"); - } - - @Override - public void setByte(int pos, byte value) { - throw new UnsupportedOperationException("Not support the operation!"); - } - - @Override - public void setShort(int pos, short value) { - throw new UnsupportedOperationException("Not support the operation!"); - } - - @Override - public void setInt(int pos, int value) { - throw new UnsupportedOperationException("Not support the operation!"); - } - - @Override - public void setLong(int pos, long value) { - throw new UnsupportedOperationException("Not support the operation!"); - } - - @Override - public void setFloat(int pos, float value) { - throw new 
UnsupportedOperationException("Not support the operation!"); - } - - @Override - public void setDouble(int pos, double value) { - throw new UnsupportedOperationException("Not support the operation!"); - } - - @Override - public void setDecimal(int pos, DecimalData value, int precision) { - throw new UnsupportedOperationException("Not support the operation!"); - } - - @Override - public void setTimestamp(int pos, TimestampData value, int precision) { - throw new UnsupportedOperationException("Not support the operation!"); - } - - @Override - public boolean[] toBooleanArray() { - boolean[] res = new boolean[numElements]; - for (int i = 0; i < numElements; i++) { - res[i] = getBoolean(i); - } - return res; - } - - @Override - public byte[] toByteArray() { - byte[] res = new byte[numElements]; - for (int i = 0; i < numElements; i++) { - res[i] = getByte(i); - } - return res; - } - - @Override - public short[] toShortArray() { - short[] res = new short[numElements]; - for (int i = 0; i < numElements; i++) { - res[i] = getShort(i); - } - return res; - } - - @Override - public int[] toIntArray() { - int[] res = new int[numElements]; - for (int i = 0; i < numElements; i++) { - res[i] = getInt(i); - } - return res; - } - - @Override - public long[] toLongArray() { - long[] res = new long[numElements]; - for (int i = 0; i < numElements; i++) { - res[i] = getLong(i); - } - return res; - } - - @Override - public float[] toFloatArray() { - float[] res = new float[numElements]; - for (int i = 0; i < numElements; i++) { - res[i] = getFloat(i); - } - return res; - } - - @Override - public double[] toDoubleArray() { - double[] res = new double[numElements]; - for (int i = 0; i < numElements; i++) { - res[i] = getDouble(i); - } - return res; - } - - private BytesColumnVector.Bytes getByteArray(int pos) { - return ((BytesColumnVector) data).getBytes(offset + pos); - } -} - diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/data/ColumnarMapData.java b/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/data/ColumnarMapData.java deleted file mode 100644 index 9792e87ec9365..0000000000000 --- a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/data/ColumnarMapData.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.format.cow.data; - -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.ColumnarArrayData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.vector.ColumnVector; - -/** - * Columnar map to support access to vector column data. - * - *

Referenced from flink 1.14.0 {@code org.apache.flink.table.data.ColumnarMapData}. - */ -public final class ColumnarMapData implements MapData { - - private final ColumnVector keyColumnVector; - private final ColumnVector valueColumnVector; - private final int offset; - private final int size; - - public ColumnarMapData( - ColumnVector keyColumnVector, - ColumnVector valueColumnVector, - int offset, - int size) { - this.keyColumnVector = keyColumnVector; - this.valueColumnVector = valueColumnVector; - this.offset = offset; - this.size = size; - } - - @Override - public int size() { - return size; - } - - @Override - public ArrayData keyArray() { - return new ColumnarArrayData(keyColumnVector, offset, size); - } - - @Override - public ArrayData valueArray() { - return new ColumnarArrayData(valueColumnVector, offset, size); - } - - @Override - public boolean equals(Object o) { - throw new UnsupportedOperationException( - "ColumnarMapData do not support equals, please compare fields one by one!"); - } - - @Override - public int hashCode() { - throw new UnsupportedOperationException( - "ColumnarMapData do not support hashCode, please hash fields one by one!"); - } -} diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/data/ColumnarRowData.java b/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/data/ColumnarRowData.java deleted file mode 100644 index ebb4ca26fa87d..0000000000000 --- a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/data/ColumnarRowData.java +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.format.cow.data; - -import org.apache.hudi.table.format.cow.vector.VectorizedColumnBatch; - -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RawValueData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.data.binary.TypedSetters; -import org.apache.flink.table.data.vector.BytesColumnVector; -import org.apache.flink.types.RowKind; - -/** - * Columnar row to support access to vector column data. - * It is a row view in {@link VectorizedColumnBatch}. - * - *

References {@code org.apache.flink.table.data.ColumnarRowData} to include FLINK-15390. - */ -public final class ColumnarRowData implements RowData, TypedSetters { - - private RowKind rowKind = RowKind.INSERT; - private VectorizedColumnBatch vectorizedColumnBatch; - private int rowId; - - public ColumnarRowData() { - } - - public ColumnarRowData(VectorizedColumnBatch vectorizedColumnBatch) { - this(vectorizedColumnBatch, 0); - } - - public ColumnarRowData(VectorizedColumnBatch vectorizedColumnBatch, int rowId) { - this.vectorizedColumnBatch = vectorizedColumnBatch; - this.rowId = rowId; - } - - public void setVectorizedColumnBatch(VectorizedColumnBatch vectorizedColumnBatch) { - this.vectorizedColumnBatch = vectorizedColumnBatch; - this.rowId = 0; - } - - public void setRowId(int rowId) { - this.rowId = rowId; - } - - @Override - public RowKind getRowKind() { - return rowKind; - } - - @Override - public void setRowKind(RowKind kind) { - this.rowKind = kind; - } - - @Override - public int getArity() { - return vectorizedColumnBatch.getArity(); - } - - @Override - public boolean isNullAt(int pos) { - return vectorizedColumnBatch.isNullAt(rowId, pos); - } - - @Override - public boolean getBoolean(int pos) { - return vectorizedColumnBatch.getBoolean(rowId, pos); - } - - @Override - public byte getByte(int pos) { - return vectorizedColumnBatch.getByte(rowId, pos); - } - - @Override - public short getShort(int pos) { - return vectorizedColumnBatch.getShort(rowId, pos); - } - - @Override - public int getInt(int pos) { - return vectorizedColumnBatch.getInt(rowId, pos); - } - - @Override - public long getLong(int pos) { - return vectorizedColumnBatch.getLong(rowId, pos); - } - - @Override - public float getFloat(int pos) { - return vectorizedColumnBatch.getFloat(rowId, pos); - } - - @Override - public double getDouble(int pos) { - return vectorizedColumnBatch.getDouble(rowId, pos); - } - - @Override - public StringData getString(int pos) { - BytesColumnVector.Bytes byteArray = vectorizedColumnBatch.getByteArray(rowId, pos); - return StringData.fromBytes(byteArray.data, byteArray.offset, byteArray.len); - } - - @Override - public DecimalData getDecimal(int pos, int precision, int scale) { - return vectorizedColumnBatch.getDecimal(rowId, pos, precision, scale); - } - - @Override - public TimestampData getTimestamp(int pos, int precision) { - return vectorizedColumnBatch.getTimestamp(rowId, pos, precision); - } - - @Override - public RawValueData getRawValue(int pos) { - throw new UnsupportedOperationException("RawValueData is not supported."); - } - - @Override - public byte[] getBinary(int pos) { - BytesColumnVector.Bytes byteArray = vectorizedColumnBatch.getByteArray(rowId, pos); - if (byteArray.len == byteArray.data.length) { - return byteArray.data; - } else { - byte[] ret = new byte[byteArray.len]; - System.arraycopy(byteArray.data, byteArray.offset, ret, 0, byteArray.len); - return ret; - } - } - - @Override - public RowData getRow(int pos, int numFields) { - return vectorizedColumnBatch.getRow(rowId, pos); - } - - @Override - public ArrayData getArray(int pos) { - return vectorizedColumnBatch.getArray(rowId, pos); - } - - @Override - public MapData getMap(int pos) { - return vectorizedColumnBatch.getMap(rowId, pos); - } - - @Override - public void setNullAt(int pos) { - throw new UnsupportedOperationException("Not support the operation!"); - } - - @Override - public void setBoolean(int pos, boolean value) { - throw new UnsupportedOperationException("Not support the operation!"); - } - - 
@Override - public void setByte(int pos, byte value) { - throw new UnsupportedOperationException("Not support the operation!"); - } - - @Override - public void setShort(int pos, short value) { - throw new UnsupportedOperationException("Not support the operation!"); - } - - @Override - public void setInt(int pos, int value) { - throw new UnsupportedOperationException("Not support the operation!"); - } - - @Override - public void setLong(int pos, long value) { - throw new UnsupportedOperationException("Not support the operation!"); - } - - @Override - public void setFloat(int pos, float value) { - throw new UnsupportedOperationException("Not support the operation!"); - } - - @Override - public void setDouble(int pos, double value) { - throw new UnsupportedOperationException("Not support the operation!"); - } - - @Override - public void setDecimal(int pos, DecimalData value, int precision) { - throw new UnsupportedOperationException("Not support the operation!"); - } - - @Override - public void setTimestamp(int pos, TimestampData value, int precision) { - throw new UnsupportedOperationException("Not support the operation!"); - } - - @Override - public boolean equals(Object o) { - throw new UnsupportedOperationException( - "ColumnarRowData do not support equals, please compare fields one by one!"); - } - - @Override - public int hashCode() { - throw new UnsupportedOperationException( - "ColumnarRowData do not support hashCode, please hash fields one by one!"); - } -} - diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/vector/HeapArrayVector.java b/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/vector/HeapArrayVector.java index f4c15b6a9b366..edd90714c87a7 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/vector/HeapArrayVector.java +++ b/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/vector/HeapArrayVector.java @@ -18,9 +18,8 @@ package org.apache.hudi.table.format.cow.vector; -import org.apache.hudi.table.format.cow.data.ColumnarArrayData; - import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.ColumnarArrayData; import org.apache.flink.table.data.vector.ArrayColumnVector; import org.apache.flink.table.data.vector.ColumnVector; import org.apache.flink.table.data.vector.heap.AbstractHeapVector; diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/vector/HeapMapColumnVector.java b/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/vector/HeapMapColumnVector.java index f05a2e73431d0..2b34a02f116b3 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/vector/HeapMapColumnVector.java +++ b/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/vector/HeapMapColumnVector.java @@ -18,10 +18,10 @@ package org.apache.hudi.table.format.cow.vector; -import org.apache.hudi.table.format.cow.data.ColumnarMapData; - +import org.apache.flink.table.data.ColumnarMapData; import org.apache.flink.table.data.MapData; import org.apache.flink.table.data.vector.ColumnVector; +import org.apache.flink.table.data.vector.MapColumnVector; import org.apache.flink.table.data.vector.heap.AbstractHeapVector; import org.apache.flink.table.data.vector.writable.WritableColumnVector; diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/vector/HeapRowColumnVector.java b/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/vector/HeapRowColumnVector.java index ad05a612c7bde..0193e6cbb1d22 100644 --- 
a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/vector/HeapRowColumnVector.java +++ b/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/vector/HeapRowColumnVector.java @@ -18,8 +18,9 @@ package org.apache.hudi.table.format.cow.vector; -import org.apache.hudi.table.format.cow.data.ColumnarRowData; - +import org.apache.flink.table.data.ColumnarRowData; +import org.apache.flink.table.data.vector.RowColumnVector; +import org.apache.flink.table.data.vector.VectorizedColumnBatch; import org.apache.flink.table.data.vector.heap.AbstractHeapVector; import org.apache.flink.table.data.vector.writable.WritableColumnVector; diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/vector/VectorizedColumnBatch.java b/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/vector/VectorizedColumnBatch.java deleted file mode 100644 index 9eee55d1eeae6..0000000000000 --- a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/vector/VectorizedColumnBatch.java +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.format.cow.vector; - -import org.apache.flink.table.data.ArrayData; -import org.apache.flink.table.data.DecimalData; -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.TimestampData; -import org.apache.flink.table.data.vector.ArrayColumnVector; -import org.apache.flink.table.data.vector.BooleanColumnVector; -import org.apache.flink.table.data.vector.ByteColumnVector; -import org.apache.flink.table.data.vector.BytesColumnVector; -import org.apache.flink.table.data.vector.ColumnVector; -import org.apache.flink.table.data.vector.DecimalColumnVector; -import org.apache.flink.table.data.vector.DoubleColumnVector; -import org.apache.flink.table.data.vector.FloatColumnVector; -import org.apache.flink.table.data.vector.IntColumnVector; -import org.apache.flink.table.data.vector.LongColumnVector; -import org.apache.flink.table.data.vector.ShortColumnVector; -import org.apache.flink.table.data.vector.TimestampColumnVector; - -import java.io.Serializable; -import java.nio.charset.StandardCharsets; - -/** - * A VectorizedColumnBatch is a set of rows, organized with each column as a vector. It is the unit - * of query execution, organized to minimize the cost per row. - * - *

{@code VectorizedColumnBatch}s are influenced by Apache Hive VectorizedRowBatch. - * - *

References {@code org.apache.flink.table.data.vector.VectorizedColumnBatch} to include FLINK-15390. - */ -public class VectorizedColumnBatch implements Serializable { - private static final long serialVersionUID = 1L; - - /** - * This number is carefully chosen to minimize overhead and typically allows one - * VectorizedColumnBatch to fit in cache. - */ - public static final int DEFAULT_SIZE = 2048; - - private int numRows; - public final ColumnVector[] columns; - - public VectorizedColumnBatch(ColumnVector[] vectors) { - this.columns = vectors; - } - - public void setNumRows(int numRows) { - this.numRows = numRows; - } - - public int getNumRows() { - return numRows; - } - - public int getArity() { - return columns.length; - } - - public boolean isNullAt(int rowId, int colId) { - return columns[colId].isNullAt(rowId); - } - - public boolean getBoolean(int rowId, int colId) { - return ((BooleanColumnVector) columns[colId]).getBoolean(rowId); - } - - public byte getByte(int rowId, int colId) { - return ((ByteColumnVector) columns[colId]).getByte(rowId); - } - - public short getShort(int rowId, int colId) { - return ((ShortColumnVector) columns[colId]).getShort(rowId); - } - - public int getInt(int rowId, int colId) { - return ((IntColumnVector) columns[colId]).getInt(rowId); - } - - public long getLong(int rowId, int colId) { - return ((LongColumnVector) columns[colId]).getLong(rowId); - } - - public float getFloat(int rowId, int colId) { - return ((FloatColumnVector) columns[colId]).getFloat(rowId); - } - - public double getDouble(int rowId, int colId) { - return ((DoubleColumnVector) columns[colId]).getDouble(rowId); - } - - public BytesColumnVector.Bytes getByteArray(int rowId, int colId) { - return ((BytesColumnVector) columns[colId]).getBytes(rowId); - } - - private byte[] getBytes(int rowId, int colId) { - BytesColumnVector.Bytes byteArray = getByteArray(rowId, colId); - if (byteArray.len == byteArray.data.length) { - return byteArray.data; - } else { - return byteArray.getBytes(); - } - } - - public String getString(int rowId, int colId) { - BytesColumnVector.Bytes byteArray = getByteArray(rowId, colId); - return new String(byteArray.data, byteArray.offset, byteArray.len, StandardCharsets.UTF_8); - } - - public DecimalData getDecimal(int rowId, int colId, int precision, int scale) { - return ((DecimalColumnVector) (columns[colId])).getDecimal(rowId, precision, scale); - } - - public TimestampData getTimestamp(int rowId, int colId, int precision) { - return ((TimestampColumnVector) (columns[colId])).getTimestamp(rowId, precision); - } - - public ArrayData getArray(int rowId, int colId) { - return ((ArrayColumnVector) columns[colId]).getArray(rowId); - } - - public RowData getRow(int rowId, int colId) { - return ((RowColumnVector) columns[colId]).getRow(rowId); - } - - public MapData getMap(int rowId, int colId) { - return ((MapColumnVector) columns[colId]).getMap(rowId); - } -} - diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadInputFormat.java b/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadInputFormat.java index 7a72bca0582fd..4404e15eaaccf 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadInputFormat.java +++ b/hudi-flink/src/main/java/org/apache/hudi/table/format/mor/MergeOnReadInputFormat.java @@ -18,6 +18,7 @@ package org.apache.hudi.table.format.mor; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieOperation; import 
org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; @@ -340,7 +341,7 @@ public boolean hasNext() { while (logRecordsKeyIterator.hasNext()) { String curAvroKey = logRecordsKeyIterator.next(); Option curAvroRecord = null; - final HoodieRecord hoodieRecord = scanner.getRecords().get(curAvroKey); + final HoodieAvroRecord hoodieRecord = (HoodieAvroRecord) scanner.getRecords().get(curAvroKey); try { curAvroRecord = hoodieRecord.getData().getInsertValue(tableSchema); } catch (IOException e) { @@ -412,7 +413,7 @@ private ClosableIterator getUnMergedLogFileIterator(MergeOnReadInputSpl public boolean hasNext() { while (recordsIterator.hasNext()) { Option curAvroRecord = null; - final HoodieRecord hoodieRecord = recordsIterator.next(); + final HoodieAvroRecord hoodieRecord = (HoodieAvroRecord) recordsIterator.next(); try { curAvroRecord = hoodieRecord.getData().getInsertValue(tableSchema); } catch (IOException e) { @@ -725,7 +726,7 @@ public boolean reachedEnd() throws IOException { } private Option getInsertValue(String curKey) throws IOException { - final HoodieRecord record = scanner.getRecords().get(curKey); + final HoodieAvroRecord record = (HoodieAvroRecord) scanner.getRecords().get(curKey); if (!emitDelete && HoodieOperation.isDelete(record.getOperation())) { return Option.empty(); } @@ -750,7 +751,7 @@ public void close() throws IOException { private Option mergeRowWithLog( RowData curRow, String curKey) throws IOException { - final HoodieRecord record = scanner.getRecords().get(curKey); + final HoodieAvroRecord record = (HoodieAvroRecord) scanner.getRecords().get(curKey); GenericRecord historyAvroRecord = (GenericRecord) rowDataToAvroConverter.convert(tableSchema, curRow); return record.getData().combineAndGetUpdateValue(historyAvroRecord, tableSchema); } diff --git a/hudi-flink/src/main/java/org/apache/hudi/util/CompactionUtil.java b/hudi-flink/src/main/java/org/apache/hudi/util/CompactionUtil.java index d04937bf7d66f..74629f9b0942f 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/util/CompactionUtil.java +++ b/hudi-flink/src/main/java/org/apache/hudi/util/CompactionUtil.java @@ -26,6 +26,7 @@ import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.configuration.FlinkOptions; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.sink.compact.FlinkCompactionConfig; @@ -106,6 +107,18 @@ public static void setAvroSchema(Configuration conf, HoodieTableMetaClient metaC conf.setString(FlinkOptions.SOURCE_AVRO_SCHEMA, tableAvroSchema.toString()); } + /** + * Sets up the avro schema string into the given write config {@code HoodieWriteConfig} + * through reading from the hoodie table metadata. + * + * @param writeConfig The HoodieWriteConfig + */ + public static void setAvroSchema(HoodieWriteConfig writeConfig, HoodieTableMetaClient metaClient) throws Exception { + TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(metaClient); + Schema tableAvroSchema = tableSchemaResolver.getTableAvroSchema(false); + writeConfig.setSchema(tableAvroSchema.toString()); + } + /** * Infers the changelog mode based on the data file schema (including metadata fields). 
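
A minimal usage sketch for the new {@code setAvroSchema(HoodieWriteConfig, HoodieTableMetaClient)} overload above (names are placeholders; a write config and a meta client for the table are assumed to already exist):

    // Copies the table's Avro schema (read without the Hudi metadata fields,
    // via getTableAvroSchema(false) inside the helper) onto the write config,
    // e.g. before scheduling a compaction.
    try {
      CompactionUtil.setAvroSchema(writeConfig, metaClient);
    } catch (Exception e) {
      throw new HoodieException("Get table avro schema error", e);
    }
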
* diff --git a/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java b/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java index 98df0bbcfd868..45d23f2ff4ea1 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java +++ b/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java @@ -406,7 +406,9 @@ public static HoodieFlinkWriteClient createWriteClient(Configuration conf) throw FileSystemViewStorageConfig rebuilt = FileSystemViewStorageConfig.newBuilder() .withStorageType(viewStorageConfig.getStorageType()) .withRemoteServerHost(viewStorageConfig.getRemoteViewServerHost()) - .withRemoteServerPort(viewStorageConfig.getRemoteViewServerPort()).build(); + .withRemoteServerPort(viewStorageConfig.getRemoteViewServerPort()) + .withRemoteTimelineClientTimeoutSecs(viewStorageConfig.getRemoteTimelineClientTimeoutSecs()) + .build(); ViewStorageProperties.createProperties(conf.getString(FlinkOptions.PATH), rebuilt); return writeClient; } diff --git a/hudi-flink/src/test/java/org/apache/hudi/sink/StreamWriteITCase.java b/hudi-flink/src/test/java/org/apache/hudi/sink/StreamWriteITCase.java index 028c058eedafc..eaa2d6ced67d9 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/sink/StreamWriteITCase.java +++ b/hudi-flink/src/test/java/org/apache/hudi/sink/StreamWriteITCase.java @@ -51,6 +51,8 @@ import org.apache.flink.util.TestLogger; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; import java.io.File; import java.nio.charset.StandardCharsets; @@ -129,10 +131,14 @@ public void testWriteToHoodieWithoutTransformer() throws Exception { testWriteToHoodie(null, EXPECTED); } - @Test - public void testMergeOnReadWriteWithCompaction() throws Exception { + @ParameterizedTest + @ValueSource(strings = {"BUCKET", "FLINK_STATE"}) + public void testMergeOnReadWriteWithCompaction(String indexType) throws Exception { int parallelism = 4; Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath()); + conf.setString(FlinkOptions.INDEX_TYPE, indexType); + conf.setInteger(FlinkOptions.BUCKET_INDEX_NUM_BUCKETS, 4); + conf.setString(FlinkOptions.INDEX_KEY_FIELD, "id"); conf.setInteger(FlinkOptions.COMPACTION_DELTA_COMMITS, 1); conf.setString(FlinkOptions.TABLE_TYPE, HoodieTableType.MERGE_ON_READ.name()); StreamExecutionEnvironment execEnv = StreamExecutionEnvironment.getExecutionEnvironment(); @@ -172,7 +178,7 @@ public void testMergeOnReadWriteWithCompaction() throws Exception { DataStream pipeline = Pipelines.hoodieStreamWrite(conf, parallelism, hoodieRecordDataStream); Pipelines.clean(conf, pipeline); Pipelines.compact(conf, pipeline); - JobClient client = execEnv.executeAsync(execEnv.getStreamGraph(conf.getString(FlinkOptions.TABLE_NAME))); + JobClient client = execEnv.executeAsync(execEnv.getStreamGraph()); if (client.getJobStatus().get() != JobStatus.FAILED) { try { TimeUnit.SECONDS.sleep(20); // wait long enough for the compaction to finish @@ -229,7 +235,7 @@ private void testWriteToHoodie( DataStream pipeline = Pipelines.hoodieStreamWrite(conf, parallelism, hoodieRecordDataStream); execEnv.addOperator(pipeline.getTransformation()); - JobClient client = execEnv.executeAsync(execEnv.getStreamGraph(conf.getString(FlinkOptions.TABLE_NAME))); + JobClient client = execEnv.executeAsync(conf.getString(FlinkOptions.TABLE_NAME)); // wait for the streaming job to finish client.getJobExecutionResult().get(); diff --git 
a/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteCopyOnWrite.java b/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteCopyOnWrite.java index a91f45263ff25..35523a8fb426c 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteCopyOnWrite.java +++ b/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteCopyOnWrite.java @@ -397,6 +397,7 @@ public void testWriteExactlyOnce() throws Exception { @Test public void testReuseEmbeddedServer() throws IOException { + conf.setInteger("hoodie.filesystem.view.remote.timeout.secs", 500); HoodieFlinkWriteClient writeClient = StreamerUtil.createWriteClient(conf); FileSystemViewStorageConfig viewStorageConfig = writeClient.getConfig().getViewStorageConfig(); @@ -406,6 +407,7 @@ public void testReuseEmbeddedServer() throws IOException { writeClient = StreamerUtil.createWriteClient(conf); assertSame(writeClient.getConfig().getViewStorageConfig().getStorageType(), FileSystemViewStorageType.REMOTE_FIRST); assertEquals(viewStorageConfig.getRemoteViewServerPort(), writeClient.getConfig().getViewStorageConfig().getRemoteViewServerPort()); + assertEquals(viewStorageConfig.getRemoteTimelineClientTimeoutSecs(), 500); } // ------------------------------------------------------------------------- diff --git a/hudi-flink/src/test/java/org/apache/hudi/sink/utils/CollectorOutput.java b/hudi-flink/src/test/java/org/apache/hudi/sink/utils/CollectorOutput.java index 3da21e6eb9887..c386e6287b8cd 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/sink/utils/CollectorOutput.java +++ b/hudi-flink/src/test/java/org/apache/hudi/sink/utils/CollectorOutput.java @@ -23,6 +23,7 @@ import org.apache.flink.streaming.runtime.streamrecord.LatencyMarker; import org.apache.flink.streaming.runtime.streamrecord.StreamElement; import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.streaming.runtime.watermarkstatus.WatermarkStatus; import org.apache.flink.util.InstantiationUtil; import org.apache.flink.util.OutputTag; @@ -49,6 +50,11 @@ public void emitWatermark(Watermark mark) { list.add(mark); } + @Override + public void emitWatermarkStatus(WatermarkStatus watermarkStatus) { + + } + @Override public void emitLatencyMarker(LatencyMarker latencyMarker) { list.add(latencyMarker); diff --git a/hudi-flink/src/test/java/org/apache/hudi/sink/utils/CompactFunctionWrapper.java b/hudi-flink/src/test/java/org/apache/hudi/sink/utils/CompactFunctionWrapper.java index fe2ddad18955f..e703515de3b7f 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/sink/utils/CompactFunctionWrapper.java +++ b/hudi-flink/src/test/java/org/apache/hudi/sink/utils/CompactFunctionWrapper.java @@ -38,6 +38,7 @@ import org.apache.flink.streaming.api.watermark.Watermark; import org.apache.flink.streaming.runtime.streamrecord.LatencyMarker; import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.streaming.runtime.watermarkstatus.WatermarkStatus; import org.apache.flink.util.Collector; import org.apache.flink.util.OutputTag; @@ -102,6 +103,11 @@ public void emitWatermark(Watermark watermark) { } + @Override + public void emitWatermarkStatus(WatermarkStatus watermarkStatus) { + + } + @Override public void collect(OutputTag outputTag, StreamRecord streamRecord) { diff --git a/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockStateInitializationContext.java b/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockStateInitializationContext.java index dd89f71110e82..c582e9553b30e 100644 --- 
a/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockStateInitializationContext.java +++ b/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockStateInitializationContext.java @@ -23,6 +23,8 @@ import org.apache.flink.runtime.state.StateInitializationContext; import org.apache.flink.runtime.state.StatePartitionStreamProvider; +import java.util.OptionalLong; + /** * A {@link FunctionInitializationContext} for testing purposes. */ @@ -39,6 +41,11 @@ public boolean isRestored() { return false; } + @Override + public OptionalLong getRestoredCheckpointId() { + return OptionalLong.empty(); + } + @Override public MockOperatorStateStore getOperatorStateStore() { return operatorStateStore; diff --git a/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockStreamingRuntimeContext.java b/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockStreamingRuntimeContext.java index 14305da3db781..8a66f1dce011a 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockStreamingRuntimeContext.java +++ b/hudi-flink/src/test/java/org/apache/hudi/sink/utils/MockStreamingRuntimeContext.java @@ -19,7 +19,7 @@ import org.apache.flink.api.common.ExecutionConfig; import org.apache.flink.api.common.state.KeyedStateStore; -import org.apache.flink.metrics.MetricGroup; +import org.apache.flink.metrics.groups.OperatorMetricGroup; import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; import org.apache.flink.runtime.jobgraph.OperatorID; import org.apache.flink.runtime.memory.MemoryManager; @@ -69,8 +69,8 @@ public MockStreamingRuntimeContext( } @Override - public MetricGroup getMetricGroup() { - return new UnregisteredMetricsGroup(); + public OperatorMetricGroup getMetricGroup() { + return UnregisteredMetricsGroup.createOperatorMetricGroup(); } @Override diff --git a/hudi-flink/src/test/java/org/apache/hudi/source/TestStreamReadOperator.java b/hudi-flink/src/test/java/org/apache/hudi/source/TestStreamReadOperator.java index 911c68511ccee..db45a75977f5e 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/source/TestStreamReadOperator.java +++ b/hudi-flink/src/test/java/org/apache/hudi/source/TestStreamReadOperator.java @@ -245,10 +245,10 @@ private OneInputStreamOperatorTestHarness create final List partitionKeys = Collections.singletonList("partition"); // This input format is used to open the emitted split. 
- TableSchemaResolver schemaUtil = new TableSchemaResolver(metaClient); + TableSchemaResolver schemaResolver = new TableSchemaResolver(metaClient); final Schema tableAvroSchema; try { - tableAvroSchema = schemaUtil.getTableAvroSchema(); + tableAvroSchema = schemaResolver.getTableAvroSchema(); } catch (Exception e) { throw new HoodieException("Get table avro schema error", e); } diff --git a/hudi-flink/src/test/java/org/apache/hudi/table/HoodieDataSourceITCase.java b/hudi-flink/src/test/java/org/apache/hudi/table/HoodieDataSourceITCase.java index f1ca68e632771..7c9b0bb6a3cc8 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/table/HoodieDataSourceITCase.java +++ b/hudi-flink/src/test/java/org/apache/hudi/table/HoodieDataSourceITCase.java @@ -31,10 +31,12 @@ import org.apache.flink.configuration.Configuration; import org.apache.flink.core.execution.JobClient; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.table.api.EnvironmentSettings; import org.apache.flink.table.api.TableEnvironment; import org.apache.flink.table.api.TableResult; import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; import org.apache.flink.table.api.config.ExecutionConfigOptions; import org.apache.flink.table.api.internal.TableEnvironmentImpl; import org.apache.flink.table.catalog.ObjectPath; @@ -62,6 +64,7 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import static org.apache.hudi.utils.TestConfigurations.catalog; import static org.apache.hudi.utils.TestConfigurations.sql; import static org.apache.hudi.utils.TestData.assertRowsEquals; import static org.junit.jupiter.api.Assertions.assertThrows; @@ -86,8 +89,24 @@ void beforeEach() { execConf.setString("restart-strategy", "fixed-delay"); execConf.setString("restart-strategy.fixed-delay.attempts", "0"); + Configuration conf = new Configuration(); + // For batch upsert use cases the current suggestion is to disable these 2 options. + // Since 1.14, the Flink runtime execution mode has switched from streaming + // to batch for batch execution mode (before that, both streaming and batch jobs used the streaming execution mode), + // and the current batch execution mode has these limitations: + // + // 1. the keyed stream sorts the inputs by key by default; + // 2. the batch state-backend requires the inputs to be sorted by state key. + // + // The hudi batch pipeline upsert case relies on the consuming sequence of index records and data records: + // the index records must be loaded before the data records so that BucketAssignFunction keeps the upsert semantics correct. + // We therefore disable these 2 options to use the streaming state-backend for batch execution mode, + // keeping the strategy from before 1.14. 
+ conf.setBoolean("execution.sorted-inputs.enabled", false); + conf.setBoolean("execution.batch-state-backend.enabled", false); + StreamExecutionEnvironment execEnv = StreamExecutionEnvironment.getExecutionEnvironment(conf); settings = EnvironmentSettings.newInstance().inBatchMode().build(); - batchTableEnv = TableEnvironmentImpl.create(settings); + batchTableEnv = StreamTableEnvironment.create(execEnv, settings); batchTableEnv.getConfig().getConfiguration() .setInteger(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM, 1); } @@ -861,7 +880,7 @@ void testWriteAndReadDebeziumJson(ExecMode execMode) throws Exception { .getContextClassLoader().getResource("debezium_json.data")).toString(); String sourceDDL = "" + "CREATE TABLE debezium_source(\n" - + " id INT NOT NULL,\n" + + " id INT NOT NULL PRIMARY KEY NOT ENFORCED,\n" + " ts BIGINT,\n" + " name STRING,\n" + " description STRING,\n" @@ -1153,6 +1172,7 @@ void testParquetComplexNestedRowTypes(String operation) { String hoodieTableDDL = sql("t1") .field("f_int int") .field("f_array array") + .field("int_array array") .field("f_map map") .field("f_row row(f_nested_array array, f_nested_row row(f_row_f0 int, f_row_f1 varchar(10)))") .pkField("f_int") @@ -1167,12 +1187,53 @@ void testParquetComplexNestedRowTypes(String operation) { List result = CollectionUtil.iterableToList( () -> tableEnv.sqlQuery("select * from t1").execute().collect()); final String expected = "[" - + "+I[1, [abc1, def1], {abc1=1, def1=3}, +I[[abc1, def1], +I[1, abc1]]], " - + "+I[2, [abc2, def2], {def2=3, abc2=1}, +I[[abc2, def2], +I[2, abc2]]], " - + "+I[3, [abc3, def3], {def3=3, abc3=1}, +I[[abc3, def3], +I[3, abc3]]]]"; + + "+I[1, [abc1, def1], [1, 1], {abc1=1, def1=3}, +I[[abc1, def1], +I[1, abc1]]], " + + "+I[2, [abc2, def2], [2, 2], {def2=3, abc2=1}, +I[[abc2, def2], +I[2, abc2]]], " + + "+I[3, [abc3, def3], [3, 3], {def3=3, abc3=1}, +I[[abc3, def3], +I[3, abc3]]]]"; assertRowsEquals(result, expected); } + @ParameterizedTest + @ValueSource(strings = {"insert", "upsert", "bulk_insert"}) + void testBuiltinFunctionWithCatalog(String operation) { + TableEnvironment tableEnv = streamTableEnv; + + String hudiCatalogDDL = catalog("hudi_" + operation) + .catalogPath(tempFile.getAbsolutePath()) + .end(); + + tableEnv.executeSql(hudiCatalogDDL); + tableEnv.executeSql("use catalog " + ("hudi_" + operation)); + + String dbName = "hudi"; + tableEnv.executeSql("create database " + dbName); + tableEnv.executeSql("use " + dbName); + + String hoodieTableDDL = sql("t1") + .field("f_int int") + .field("f_date DATE") + .pkField("f_int") + .partitionField("f_int") + .option(FlinkOptions.PATH, tempFile.getAbsolutePath() + "/" + dbName + "/" + operation) + .option(FlinkOptions.OPERATION, operation) + .end(); + tableEnv.executeSql(hoodieTableDDL); + + String insertSql = "insert into t1 values (1, TO_DATE('2022-02-02')), (2, DATE '2022-02-02')"; + execInsertSql(tableEnv, insertSql); + + List result = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1").execute().collect()); + final String expected = "[" + + "+I[1, 2022-02-02], " + + "+I[2, 2022-02-02]]"; + assertRowsEquals(result, expected); + + List partitionResult = CollectionUtil.iterableToList( + () -> tableEnv.sqlQuery("select * from t1 where f_int = 1").execute().collect()); + assertRowsEquals(partitionResult, "[+I[1, 2022-02-02]]"); + } + // ------------------------------------------------------------------------- // Utilities // 
------------------------------------------------------------------------- diff --git a/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableFactory.java b/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableFactory.java index cbdffe360fd2b..a76e00816189a 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableFactory.java +++ b/hudi-flink/src/test/java/org/apache/hudi/table/TestHoodieTableFactory.java @@ -24,9 +24,11 @@ import org.apache.hudi.exception.HoodieValidationException; import org.apache.hudi.hive.MultiPartKeysValueExtractor; import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor; +import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.keygen.ComplexAvroKeyGenerator; import org.apache.hudi.keygen.NonpartitionedAvroKeyGenerator; import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.apache.hudi.util.StreamerUtil; import org.apache.hudi.utils.SchemaBuilder; import org.apache.hudi.utils.TestConfigurations; @@ -346,6 +348,16 @@ void testSetupHoodieKeyOptionsForSink() { final Configuration conf3 = tableSink3.getConf(); assertThat(conf3.get(FlinkOptions.RECORD_KEY_FIELD), is("f0,f1")); assertThat(conf3.get(FlinkOptions.KEYGEN_CLASS_NAME), is(NonpartitionedAvroKeyGenerator.class.getName())); + + // definition of bucket index + this.conf.setString(FlinkOptions.INDEX_TYPE, HoodieIndex.IndexType.BUCKET.name()); + final MockContext sinkContext4 = MockContext.getInstance(this.conf, schema2, ""); + final HoodieTableSink tableSink4 = (HoodieTableSink) new HoodieTableFactory().createDynamicTableSink(sinkContext4); + final Configuration conf4 = tableSink4.getConf(); + assertThat(conf4.get(FlinkOptions.RECORD_KEY_FIELD), is("f0,f1")); + assertThat(conf4.get(FlinkOptions.INDEX_KEY_FIELD), is("f0,f1")); + assertThat(conf4.get(FlinkOptions.INDEX_TYPE), is(HoodieIndex.IndexType.BUCKET.name())); + assertThat(conf4.get(FlinkOptions.KEYGEN_CLASS_NAME), is(NonpartitionedAvroKeyGenerator.class.getName())); } @Test @@ -419,11 +431,11 @@ void testSetupTimestampBasedKeyGenForSink() { final Configuration conf1 = tableSource1.getConf(); assertThat(conf1.get(FlinkOptions.RECORD_KEY_FIELD), is("f0")); assertThat(conf1.get(FlinkOptions.KEYGEN_CLASS_NAME), is(TimestampBasedAvroKeyGenerator.class.getName())); - assertThat(conf1.getString(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_TYPE_FIELD_PROP, "dummy"), + assertThat(conf1.getString(KeyGeneratorOptions.Config.TIMESTAMP_TYPE_FIELD_PROP, "dummy"), is("EPOCHMILLISECONDS")); - assertThat(conf1.getString(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, "dummy"), + assertThat(conf1.getString(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, "dummy"), is(FlinkOptions.PARTITION_FORMAT_HOUR)); - assertThat(conf1.getString(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_OUTPUT_TIMEZONE_FORMAT_PROP, "dummy"), + assertThat(conf1.getString(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_TIMEZONE_FORMAT_PROP, "dummy"), is("UTC")); } diff --git a/hudi-flink/src/test/java/org/apache/hudi/utils/TestConfigurations.java b/hudi-flink/src/test/java/org/apache/hudi/utils/TestConfigurations.java index 46cad3e826d3e..d1b6e04a1835d 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/utils/TestConfigurations.java +++ b/hudi-flink/src/test/java/org/apache/hudi/utils/TestConfigurations.java @@ -64,12 +64,12 @@ private TestConfigurations() { .map(RowType.RowField::asSummaryString).collect(Collectors.toList()); public 
static final DataType ROW_DATA_TYPE_WIDER = DataTypes.ROW( - DataTypes.FIELD("uuid", DataTypes.VARCHAR(20)),// record key - DataTypes.FIELD("name", DataTypes.VARCHAR(10)), - DataTypes.FIELD("age", DataTypes.INT()), - DataTypes.FIELD("salary", DataTypes.DOUBLE()), - DataTypes.FIELD("ts", DataTypes.TIMESTAMP(3)), // precombine field - DataTypes.FIELD("partition", DataTypes.VARCHAR(10))) + DataTypes.FIELD("uuid", DataTypes.VARCHAR(20)),// record key + DataTypes.FIELD("name", DataTypes.VARCHAR(10)), + DataTypes.FIELD("age", DataTypes.INT()), + DataTypes.FIELD("salary", DataTypes.DOUBLE()), + DataTypes.FIELD("ts", DataTypes.TIMESTAMP(3)), // precombine field + DataTypes.FIELD("partition", DataTypes.VARCHAR(10))) .notNull(); public static final RowType ROW_TYPE_WIDER = (RowType) ROW_DATA_TYPE_WIDER.getLogicalType(); @@ -112,6 +112,15 @@ public static String getCreateHoodieTableDDL( return builder.toString(); } + public static String getCreateHudiCatalogDDL(final String catalogName, final String catalogPath) { + StringBuilder builder = new StringBuilder(); + builder.append("create catalog ").append(catalogName).append(" with (\n"); + builder.append(" 'type' = 'hudi',\n" + + " 'catalog.path' = '").append(catalogPath).append("'"); + builder.append("\n)"); + return builder.toString(); + } + public static String getFileSourceDDL(String tableName) { return getFileSourceDDL(tableName, "test_source.data"); } @@ -222,6 +231,10 @@ public static Sql sql(String tableName) { return new Sql(tableName); } + public static Catalog catalog(String catalogName) { + return new Catalog(catalogName); + } + // ------------------------------------------------------------------------- // Utilities // ------------------------------------------------------------------------- @@ -285,4 +298,22 @@ public String end() { this.withPartition, this.pkField, this.partitionField); } } + + public static class Catalog { + private final String catalogName; + private String catalogPath = "."; + + public Catalog(String catalogName) { + this.catalogName = catalogName; + } + + public Catalog catalogPath(String catalogPath) { + this.catalogPath = catalogPath; + return this; + } + + public String end() { + return TestConfigurations.getCreateHudiCatalogDDL(catalogName, catalogPath); + } + } } diff --git a/hudi-flink/src/test/java/org/apache/hudi/utils/TestSQL.java b/hudi-flink/src/test/java/org/apache/hudi/utils/TestSQL.java index 595d142b7cc0d..1695e4e7149a9 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/utils/TestSQL.java +++ b/hudi-flink/src/test/java/org/apache/hudi/utils/TestSQL.java @@ -58,7 +58,7 @@ private TestSQL() { + "(3, array['abc3', 'def3'], map['abc3', 1, 'def3', 3], row(3, 'abc3'))"; public static final String COMPLEX_NESTED_ROW_TYPE_INSERT_T1 = "insert into t1 values\n" - + "(1, array['abc1', 'def1'], map['abc1', 1, 'def1', 3], row(array['abc1', 'def1'], row(1, 'abc1'))),\n" - + "(2, array['abc2', 'def2'], map['abc2', 1, 'def2', 3], row(array['abc2', 'def2'], row(2, 'abc2'))),\n" - + "(3, array['abc3', 'def3'], map['abc3', 1, 'def3', 3], row(array['abc3', 'def3'], row(3, 'abc3')))"; + + "(1, array['abc1', 'def1'], array[1, 1], map['abc1', 1, 'def1', 3], row(array['abc1', 'def1'], row(1, 'abc1'))),\n" + + "(2, array['abc2', 'def2'], array[2, 2], map['abc2', 1, 'def2', 3], row(array['abc2', 'def2'], row(2, 'abc2'))),\n" + + "(3, array['abc3', 'def3'], array[3, 3], map['abc3', 1, 'def3', 3], row(array['abc3', 'def3'], row(3, 'abc3')))"; } diff --git 
a/hudi-flink/src/test/java/org/apache/hudi/utils/factory/ContinuousFileSourceFactory.java b/hudi-flink/src/test/java/org/apache/hudi/utils/factory/ContinuousFileSourceFactory.java index 92d9c55723518..31b3ad5c7669d 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/utils/factory/ContinuousFileSourceFactory.java +++ b/hudi-flink/src/test/java/org/apache/hudi/utils/factory/ContinuousFileSourceFactory.java @@ -53,7 +53,7 @@ public DynamicTableSource createDynamicTableSource(Context context) { Configuration conf = (Configuration) helper.getOptions(); Path path = new Path(conf.getOptional(FlinkOptions.PATH).orElseThrow(() -> new ValidationException("Option [path] should be not empty."))); - return new ContinuousFileSource(context.getCatalogTable().getSchema(), path, conf); + return new ContinuousFileSource(context.getCatalogTable().getResolvedSchema(), path, conf); } @Override diff --git a/hudi-flink/src/test/java/org/apache/hudi/utils/source/ContinuousFileSource.java b/hudi-flink/src/test/java/org/apache/hudi/utils/source/ContinuousFileSource.java index a44061076f581..d38aad60c3452 100644 --- a/hudi-flink/src/test/java/org/apache/hudi/utils/source/ContinuousFileSource.java +++ b/hudi-flink/src/test/java/org/apache/hudi/utils/source/ContinuousFileSource.java @@ -18,15 +18,15 @@ package org.apache.hudi.utils.source; +import org.apache.flink.api.common.state.CheckpointListener; import org.apache.flink.configuration.Configuration; import org.apache.flink.core.fs.Path; import org.apache.flink.formats.common.TimestampFormat; import org.apache.flink.formats.json.JsonRowDataDeserializationSchema; -import org.apache.flink.runtime.state.CheckpointListener; import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.streaming.api.functions.source.SourceFunction; -import org.apache.flink.table.api.TableSchema; +import org.apache.flink.table.catalog.ResolvedSchema; import org.apache.flink.table.connector.ChangelogMode; import org.apache.flink.table.connector.source.DataStreamScanProvider; import org.apache.flink.table.connector.source.DynamicTableSource; @@ -59,12 +59,12 @@ */ public class ContinuousFileSource implements ScanTableSource { - private final TableSchema tableSchema; + private final ResolvedSchema tableSchema; private final Path path; private final Configuration conf; public ContinuousFileSource( - TableSchema tableSchema, + ResolvedSchema tableSchema, Path path, Configuration conf) { this.tableSchema = tableSchema; @@ -83,7 +83,7 @@ public boolean isBounded() { @Override public DataStream produceDataStream(StreamExecutionEnvironment execEnv) { - final RowType rowType = (RowType) tableSchema.toRowDataType().getLogicalType(); + final RowType rowType = (RowType) tableSchema.toSourceRowDataType().getLogicalType(); JsonRowDataDeserializationSchema deserializationSchema = new JsonRowDataDeserializationSchema( rowType, InternalTypeInfo.of(rowType), @@ -178,7 +178,7 @@ private void loadDataBuffer() { } @Override - public void notifyCheckpointComplete(long l) throws Exception { + public void notifyCheckpointComplete(long l) { this.currentCP.incrementAndGet(); } } diff --git a/hudi-hadoop-mr/pom.xml b/hudi-hadoop-mr/pom.xml index 57fbdb7b8e267..bf87bfaa36a81 100644 --- a/hudi-hadoop-mr/pom.xml +++ b/hudi-hadoop-mr/pom.xml @@ -30,13 +30,6 @@ - - - org.scala-lang - scala-library - ${scala.version} - - org.apache.hudi diff --git 
a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/BaseFileWithLogsSplit.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/BaseFileWithLogsSplit.java deleted file mode 100644 index c9afa9119c0c5..0000000000000 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/BaseFileWithLogsSplit.java +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.hadoop; - -import org.apache.hudi.common.model.HoodieLogFile; - -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapred.FileSplit; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -/** - * Encode additional information in split to track matching log file and base files. - * Hence, this class tracks a log/base file split. - */ -public class BaseFileWithLogsSplit extends FileSplit { - // a flag to mark this split is produced by incremental query or not. - private boolean belongToIncrementalSplit = false; - // the log file paths of this split. - private List deltaLogFiles = new ArrayList<>(); - // max commit time of current split. - private String maxCommitTime = ""; - // the basePath of current hoodie table. - private String basePath = ""; - // the base file belong to this split. 
- private String baseFilePath = ""; - - public BaseFileWithLogsSplit(Path file, long start, long length, String[] hosts) { - super(file, start, length, hosts); - } - - @Override - public void write(DataOutput out) throws IOException { - super.write(out); - out.writeBoolean(belongToIncrementalSplit); - Text.writeString(out, maxCommitTime); - Text.writeString(out, basePath); - Text.writeString(out, baseFilePath); - out.writeInt(deltaLogFiles.size()); - for (HoodieLogFile logFile : deltaLogFiles) { - Text.writeString(out, logFile.getPath().toString()); - out.writeLong(logFile.getFileSize()); - } - } - - @Override - public void readFields(DataInput in) throws IOException { - super.readFields(in); - belongToIncrementalSplit = in.readBoolean(); - maxCommitTime = Text.readString(in); - basePath = Text.readString(in); - baseFilePath = Text.readString(in); - int deltaLogSize = in.readInt(); - List tempDeltaLogs = new ArrayList<>(); - for (int i = 0; i < deltaLogSize; i++) { - String logPath = Text.readString(in); - long logFileSize = in.readLong(); - tempDeltaLogs.add(new HoodieLogFile(new Path(logPath), logFileSize)); - } - deltaLogFiles = tempDeltaLogs; - } - - public boolean getBelongToIncrementalSplit() { - return belongToIncrementalSplit; - } - - public void setBelongToIncrementalSplit(boolean belongToIncrementalSplit) { - this.belongToIncrementalSplit = belongToIncrementalSplit; - } - - public List getDeltaLogFiles() { - return deltaLogFiles; - } - - public void setDeltaLogFiles(List deltaLogFiles) { - this.deltaLogFiles = deltaLogFiles; - } - - public String getMaxCommitTime() { - return maxCommitTime; - } - - public void setMaxCommitTime(String maxCommitTime) { - this.maxCommitTime = maxCommitTime; - } - - public String getBasePath() { - return basePath; - } - - public void setBasePath(String basePath) { - this.basePath = basePath; - } - - public String getBaseFilePath() { - return baseFilePath; - } - - public void setBaseFilePath(String baseFilePath) { - this.baseFilePath = baseFilePath; - } -} diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/BootstrapBaseFileSplit.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/BootstrapBaseFileSplit.java index 437304fb043d0..6db1751771904 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/BootstrapBaseFileSplit.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/BootstrapBaseFileSplit.java @@ -32,9 +32,11 @@ public class BootstrapBaseFileSplit extends FileSplit { private FileSplit bootstrapFileSplit; - public BootstrapBaseFileSplit() { - super(); - } + /** + * NOTE: This ctor is necessary for Hive to be able to serialize and + * then instantiate it when deserializing back + */ + public BootstrapBaseFileSplit() {} public BootstrapBaseFileSplit(FileSplit baseSplit, FileSplit bootstrapFileSplit) throws IOException { diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieTableFileIndex.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieTableFileIndex.java index 585728d1e72b7..000fce5e8fbff 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieTableFileIndex.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HiveHoodieTableFileIndex.java @@ -20,8 +20,7 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; -import org.apache.hudi.HoodieTableFileIndexBase; -import org.apache.hudi.FileStatusCacheTrait; +import org.apache.hudi.BaseHoodieTableFileIndex; import org.apache.hudi.common.config.TypedProperties; import 
org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.model.HoodieTableQueryType; @@ -29,15 +28,13 @@ import org.apache.hudi.common.util.Option; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import scala.Function0; -import scala.collection.JavaConverters; import java.util.List; /** - * Implementation of {@link HoodieTableFileIndexBase} for Hive-based query engines + * Implementation of {@link BaseHoodieTableFileIndex} for Hive-based query engines */ -public class HiveHoodieTableFileIndex extends HoodieTableFileIndexBase { +public class HiveHoodieTableFileIndex extends BaseHoodieTableFileIndex { public static final Logger LOG = LoggerFactory.getLogger(HiveHoodieTableFileIndex.class); @@ -53,16 +50,13 @@ public HiveHoodieTableFileIndex(HoodieEngineContext engineContext, metaClient, configProperties, queryType, - JavaConverters.asScalaBufferConverter(queryPaths).asScala(), - toScalaOption(specifiedQueryInstant), + queryPaths, + specifiedQueryInstant, shouldIncludePendingCommits, + true, new NoopCache()); } - private static scala.Option toScalaOption(Option opt) { - return scala.Option.apply(opt.orElse(null)); - } - @Override public Object[] parsePartitionColumnValues(String[] partitionColumns, String partitionPath) { // NOTE: Parsing partition path into partition column values isn't required on Hive, @@ -71,20 +65,10 @@ public Object[] parsePartitionColumnValues(String[] partitionColumns, String par return new Object[0]; } - @Override - public void logInfo(Function0 lazyStr) { - LOG.info(lazyStr.apply()); - } - - @Override - public void logWarning(Function0 lazyStr) { - LOG.info(lazyStr.apply()); - } - - static class NoopCache implements FileStatusCacheTrait { + static class NoopCache implements FileStatusCache { @Override - public scala.Option get(Path path) { - return scala.Option.empty(); + public Option get(Path path) { + return Option.empty(); } @Override diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileInputFormatBase.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieCopyOnWriteTableInputFormat.java similarity index 65% rename from hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileInputFormatBase.java rename to hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieCopyOnWriteTableInputFormat.java index a35eb50945285..2b8dae255e3c4 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieFileInputFormatBase.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieCopyOnWriteTableInputFormat.java @@ -18,40 +18,47 @@ package org.apache.hudi.hadoop; -import org.apache.hadoop.conf.Configurable; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.ArrayWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.InputSplit; import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapreduce.Job; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.engine.HoodieLocalEngineContext; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieTableQueryType; +import 
org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.TableSchemaResolver; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.CollectionUtils; import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.hadoop.realtime.HoodieVirtualKeyInfo; import org.apache.hudi.hadoop.utils.HoodieHiveUtils; import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; -import scala.collection.JavaConverters; -import scala.collection.Seq; +import org.apache.parquet.schema.MessageType; import javax.annotation.Nonnull; import java.io.IOException; +import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.stream.Collectors; -import java.util.stream.Stream; import static org.apache.hudi.common.util.ValidationUtils.checkState; @@ -64,43 +71,35 @@ *
  • Incremental mode: reading table's state as of particular timestamp (or instant, in Hudi's terms)
  • External mode: reading non-Hudi partitions
* + * + * NOTE: This class is invariant of the underlying file-format of the files being read */ -public abstract class HoodieFileInputFormatBase extends FileInputFormat - implements Configurable { - - protected Configuration conf; - - protected abstract boolean includeLogFilesForSnapShotView(); +public class HoodieCopyOnWriteTableInputFormat extends HoodieTableInputFormat { @Override - public final Configuration getConf() { - return conf; + protected boolean isSplitable(FileSystem fs, Path filename) { + return !(filename instanceof PathWithBootstrapFileStatus); } @Override - public final void setConf(Configuration conf) { - this.conf = conf; - } + protected FileSplit makeSplit(Path file, long start, long length, + String[] hosts) { + FileSplit split = new FileSplit(file, start, length, hosts); - @Nonnull - private static RealtimeFileStatus createRealtimeFileStatusUnchecked(HoodieLogFile latestLogFile, Stream logFiles) { - List sortedLogFiles = logFiles.sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList()); - try { - RealtimeFileStatus rtFileStatus = new RealtimeFileStatus(latestLogFile.getFileStatus()); - rtFileStatus.setDeltaLogFiles(sortedLogFiles); - return rtFileStatus; - } catch (IOException e) { - throw new RuntimeException(e); + if (file instanceof PathWithBootstrapFileStatus) { + return makeExternalFileSplit((PathWithBootstrapFileStatus)file, split); } + return split; } - @Nonnull - private static FileStatus getFileStatusUnchecked(Option baseFileOpt) { - try { - return HoodieInputFormatUtils.getFileStatus(baseFileOpt.get()); - } catch (IOException ioe) { - throw new RuntimeException(ioe); + @Override + protected FileSplit makeSplit(Path file, long start, long length, + String[] hosts, String[] inMemoryHosts) { + FileSplit split = new FileSplit(file, start, length, hosts, inMemoryHosts); + if (file instanceof PathWithBootstrapFileStatus) { + return makeExternalFileSplit((PathWithBootstrapFileStatus)file, split); } + return split; } @Override @@ -143,6 +142,70 @@ public FileStatus[] listStatus(JobConf job) throws IOException { return returns.toArray(new FileStatus[0]); } + @Override + public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException { + throw new UnsupportedEncodingException("not implemented"); + } + + /** + * Abstracts and exposes {@link FileInputFormat#listStatus(JobConf)} operation to subclasses that + * lists files (returning an array of {@link FileStatus}) corresponding to the input paths specified + * as part of provided {@link JobConf} + */ + protected final FileStatus[] doListStatus(JobConf job) throws IOException { + return super.listStatus(job); + } + + /** + * Achieves listStatus functionality for an incrementally queried table. Instead of listing all + * partitions and then filtering based on the commits of interest, this logic first extracts the + * partitions touched by the desired commits and then lists only those partitions. 
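+ * + * For example (with hypothetical partition paths): if the commits being queried only touched + * partitions 2022/01/01 and 2022/01/02, only those two paths are set as input paths on the + * {@link JobConf} before the listing is performed. 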
+ */ + protected List listStatusForIncrementalMode(JobConf job, + HoodieTableMetaClient tableMetaClient, + List inputPaths, + String incrementalTable) throws IOException { + Job jobContext = Job.getInstance(job); + Option timeline = HoodieInputFormatUtils.getFilteredCommitsTimeline(jobContext, tableMetaClient); + if (!timeline.isPresent()) { + return null; + } + Option> commitsToCheck = HoodieInputFormatUtils.getCommitsForIncrementalQuery(jobContext, incrementalTable, timeline.get()); + if (!commitsToCheck.isPresent()) { + return null; + } + Option incrementalInputPaths = HoodieInputFormatUtils.getAffectedPartitions(commitsToCheck.get(), tableMetaClient, timeline.get(), inputPaths); + // Mutate the JobConf to set the input paths to only partitions touched by incremental pull. + if (!incrementalInputPaths.isPresent()) { + return null; + } + setInputPaths(job, incrementalInputPaths.get()); + FileStatus[] fileStatuses = doListStatus(job); + return HoodieInputFormatUtils.filterIncrementalFileStatus(jobContext, tableMetaClient, timeline.get(), fileStatuses, commitsToCheck.get()); + } + + protected FileStatus createFileStatusUnchecked(FileSlice fileSlice, HiveHoodieTableFileIndex fileIndex, Option virtualKeyInfoOpt) { + Option baseFileOpt = fileSlice.getBaseFile(); + + if (baseFileOpt.isPresent()) { + return getFileStatusUnchecked(baseFileOpt.get()); + } else { + throw new IllegalStateException("Invalid state: base-file has to be present"); + } + } + + private BootstrapBaseFileSplit makeExternalFileSplit(PathWithBootstrapFileStatus file, FileSplit split) { + try { + LOG.info("Making external data split for " + file); + FileStatus externalFileStatus = file.getBootstrapFileStatus(); + FileSplit externalFileSplit = makeSplit(externalFileStatus.getPath(), 0, externalFileStatus.getLen(), + new String[0], new String[0]); + return new BootstrapBaseFileSplit(split, externalFileSplit); + } catch (IOException e) { + throw new HoodieIOException(e.getMessage(), e); + } + } + @Nonnull private List listStatusForSnapshotMode(JobConf job, Map tableMetaClientMap, @@ -172,36 +235,24 @@ private List listStatusForSnapshotMode(JobConf job, engineContext, tableMetaClient, props, - HoodieTableQueryType.QUERY_TYPE_SNAPSHOT, + HoodieTableQueryType.SNAPSHOT, partitionPaths, queryCommitInstant, shouldIncludePendingCommits); - Map> partitionedFileSlices = - JavaConverters.mapAsJavaMapConverter(fileIndex.listFileSlices()).asJava(); + Map> partitionedFileSlices = fileIndex.listFileSlices(); + + Option virtualKeyInfoOpt = getHoodieVirtualKeyInfo(tableMetaClient); targetFiles.addAll( partitionedFileSlices.values() .stream() - .flatMap(seq -> JavaConverters.seqAsJavaListConverter(seq).asJava().stream()) - .map(fileSlice -> { - Option baseFileOpt = fileSlice.getBaseFile(); - Option latestLogFileOpt = fileSlice.getLatestLogFile(); - if (baseFileOpt.isPresent()) { - return getFileStatusUnchecked(baseFileOpt); - } else if (includeLogFilesForSnapShotView() && latestLogFileOpt.isPresent()) { - return createRealtimeFileStatusUnchecked(latestLogFileOpt.get(), fileSlice.getLogFiles()); - } else { - throw new IllegalStateException("Invalid state: either base-file or log-file should be present"); - } - }) + .flatMap(Collection::stream) + .map(fileSlice -> createFileStatusUnchecked(fileSlice, fileIndex, virtualKeyInfoOpt)) .collect(Collectors.toList()) ); } - // TODO cleanup - validate(targetFiles, listStatusForSnapshotModeLegacy(job, tableMetaClientMap, snapshotPaths)); - return targetFiles; } @@ -211,42 +262,28 @@ private void 
validate(List targetFiles, List legacyFileS } @Nonnull - private List listStatusForSnapshotModeLegacy(JobConf job, Map tableMetaClientMap, List snapshotPaths) throws IOException { - return HoodieInputFormatUtils.filterFileStatusForSnapshotMode(job, tableMetaClientMap, snapshotPaths, includeLogFilesForSnapShotView()); - } - - /** - * Abstracts and exposes {@link FileInputFormat#listStatus(JobConf)} operation to subclasses that - * lists files (returning an array of {@link FileStatus}) corresponding to the input paths specified - * as part of provided {@link JobConf} - */ - protected final FileStatus[] doListStatus(JobConf job) throws IOException { - return super.listStatus(job); + protected static FileStatus getFileStatusUnchecked(HoodieBaseFile baseFile) { + try { + return HoodieInputFormatUtils.getFileStatus(baseFile); + } catch (IOException ioe) { + throw new HoodieIOException("Failed to get file-status", ioe); + } } - /** - * Achieves listStatus functionality for an incrementally queried table. Instead of listing all - * partitions and then filtering based on the commits of interest, this logic first extracts the - * partitions touched by the desired commits and then lists only those partitions. - */ - protected List listStatusForIncrementalMode(JobConf job, HoodieTableMetaClient tableMetaClient, - List inputPaths, String incrementalTable) throws IOException { - Job jobContext = Job.getInstance(job); - Option timeline = HoodieInputFormatUtils.getFilteredCommitsTimeline(jobContext, tableMetaClient); - if (!timeline.isPresent()) { - return null; + protected static Option getHoodieVirtualKeyInfo(HoodieTableMetaClient metaClient) { + HoodieTableConfig tableConfig = metaClient.getTableConfig(); + if (tableConfig.populateMetaFields()) { + return Option.empty(); } - Option> commitsToCheck = HoodieInputFormatUtils.getCommitsForIncrementalQuery(jobContext, incrementalTable, timeline.get()); - if (!commitsToCheck.isPresent()) { - return null; - } - Option incrementalInputPaths = HoodieInputFormatUtils.getAffectedPartitions(commitsToCheck.get(), tableMetaClient, timeline.get(), inputPaths); - // Mutate the JobConf to set the input paths to only partitions touched by incremental pull. 
- if (!incrementalInputPaths.isPresent()) { - return null; + + TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(metaClient); + try { + MessageType parquetSchema = tableSchemaResolver.getTableParquetSchema(); + return Option.of(new HoodieVirtualKeyInfo(tableConfig.getRecordKeyFieldProp(), + tableConfig.getPartitionFieldProp(), parquetSchema.getFieldIndex(tableConfig.getRecordKeyFieldProp()), + parquetSchema.getFieldIndex(tableConfig.getPartitionFieldProp()))); + } catch (Exception exception) { + throw new HoodieException("Fetching table schema failed with exception ", exception); } - setInputPaths(job, incrementalInputPaths.get()); - FileStatus[] fileStatuses = doListStatus(job); - return HoodieInputFormatUtils.filterIncrementalFileStatus(jobContext, tableMetaClient, timeline.get(), fileStatuses, commitsToCheck.get()); } } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHFileInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHFileInputFormat.java index 2baf140e21138..6eb1663a0d12c 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHFileInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieHFileInputFormat.java @@ -35,17 +35,12 @@ * HoodieInputFormat for HUDI datasets which store data in HFile base file format. */ @UseFileSplitsFromInputFormat -public class HoodieHFileInputFormat extends HoodieFileInputFormatBase { +public class HoodieHFileInputFormat extends HoodieCopyOnWriteTableInputFormat { protected HoodieDefaultTimeline filterInstantsTimeline(HoodieDefaultTimeline timeline) { return HoodieInputFormatUtils.filterInstantsTimeline(timeline); } - @Override - protected boolean includeLogFilesForSnapShotView() { - return false; - } - @Override public RecordReader getRecordReader(final InputSplit split, final JobConf job, final Reporter reporter) throws IOException { diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java index f63352829faf9..7b79f61e49bcf 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java @@ -18,18 +18,6 @@ package org.apache.hudi.hadoop; -import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline; -import org.apache.hudi.common.util.collection.Pair; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.hadoop.utils.HoodieHiveUtils; -import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; - -import org.apache.hadoop.conf.Configurable; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat; import org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg; import org.apache.hadoop.hive.ql.plan.TableScanDesc; import org.apache.hadoop.io.ArrayWritable; @@ -39,6 +27,9 @@ import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.RecordReader; import org.apache.hadoop.mapred.Reporter; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.hadoop.utils.HoodieHiveUtils; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -54,20 +45,16 @@ */ @UseRecordReaderFromInputFormat @UseFileSplitsFromInputFormat 
-public class HoodieParquetInputFormat extends HoodieFileInputFormatBase implements Configurable { +public class HoodieParquetInputFormat extends HoodieParquetInputFormatBase { private static final Logger LOG = LogManager.getLogger(HoodieParquetInputFormat.class); - // NOTE: We're only using {@code MapredParquetInputFormat} to compose vectorized - // {@code RecordReader} - private final MapredParquetInputFormat mapredParquetInputFormat = new MapredParquetInputFormat(); - - protected HoodieDefaultTimeline filterInstantsTimeline(HoodieDefaultTimeline timeline) { - return HoodieInputFormatUtils.filterInstantsTimeline(timeline); + public HoodieParquetInputFormat() { + super(new HoodieCopyOnWriteTableInputFormat()); } - protected boolean includeLogFilesForSnapShotView() { - return false; + protected HoodieParquetInputFormat(HoodieCopyOnWriteTableInputFormat delegate) { + super(delegate); } @Override @@ -96,36 +83,10 @@ public RecordReader getRecordReader(final InputSpli return getRecordReaderInternal(split, job, reporter); } - @Override - protected boolean isSplitable(FileSystem fs, Path filename) { - return !(filename instanceof PathWithBootstrapFileStatus); - } - - @Override - protected FileSplit makeSplit(Path file, long start, long length, - String[] hosts) { - FileSplit split = new FileSplit(file, start, length, hosts); - - if (file instanceof PathWithBootstrapFileStatus) { - return makeExternalFileSplit((PathWithBootstrapFileStatus)file, split); - } - return split; - } - - @Override - protected FileSplit makeSplit(Path file, long start, long length, - String[] hosts, String[] inMemoryHosts) { - FileSplit split = new FileSplit(file, start, length, hosts, inMemoryHosts); - if (file instanceof PathWithBootstrapFileStatus) { - return makeExternalFileSplit((PathWithBootstrapFileStatus)file, split); - } - return split; - } - private RecordReader getRecordReaderInternal(InputSplit split, JobConf job, Reporter reporter) throws IOException { - return mapredParquetInputFormat.getRecordReader(split, job, reporter); + return super.getRecordReader(split, job, reporter); } private RecordReader createBootstrappingRecordReader(InputSplit split, @@ -176,16 +137,4 @@ private RecordReader createBootstrappingRecordReade true); } } - - private BootstrapBaseFileSplit makeExternalFileSplit(PathWithBootstrapFileStatus file, FileSplit split) { - try { - LOG.info("Making external data split for " + file); - FileStatus externalFileStatus = file.getBootstrapFileStatus(); - FileSplit externalFileSplit = makeSplit(externalFileStatus.getPath(), 0, externalFileStatus.getLen(), - new String[0], new String[0]); - return new BootstrapBaseFileSplit(split, externalFileSplit); - } catch (IOException e) { - throw new HoodieIOException(e.getMessage(), e); - } - } } \ No newline at end of file diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormatBase.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormatBase.java new file mode 100644 index 0000000000000..ed88acacb4d2f --- /dev/null +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormatBase.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.hadoop; + +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hudi.hadoop.realtime.HoodieMergeOnReadTableInputFormat; + +import java.io.IOException; + +/** + * !!! PLEASE READ CAREFULLY !!! + * + * NOTE: Hive applies optimizations that are gated on whether a {@link FileInputFormat} + * implementation inherits from {@link MapredParquetInputFormat}. + * + * To make sure that Hudi implementations leverage these optimizations to the fullest, this class + * serves as the base-class for every {@link FileInputFormat} implementation working with the Parquet file-format. + * + * However, this class is only a thin facade over the actual implementation hierarchy: it expects + * either {@link HoodieCopyOnWriteTableInputFormat} or {@link HoodieMergeOnReadTableInputFormat} to be supplied, + * to which it delegates all of its necessary methods. 
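+ * + * For example, the vectorized Parquet {@code RecordReader} is only composed through + * {@link MapredParquetInputFormat}, which is why Hudi's Parquet-based input formats extend + * this facade rather than a plain {@link FileInputFormat}. 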
+ */ +public abstract class HoodieParquetInputFormatBase extends MapredParquetInputFormat implements Configurable { + + private final HoodieTableInputFormat inputFormatDelegate; + + protected HoodieParquetInputFormatBase(HoodieCopyOnWriteTableInputFormat inputFormatDelegate) { + this.inputFormatDelegate = inputFormatDelegate; + } + + @Override + public final void setConf(Configuration conf) { + inputFormatDelegate.setConf(conf); + } + + @Override + public final Configuration getConf() { + return inputFormatDelegate.getConf(); + } + + @Override + public final InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { + return inputFormatDelegate.getSplits(job, numSplits); + } + + @Override + protected final boolean isSplitable(FileSystem fs, Path filename) { + return inputFormatDelegate.isSplitable(fs, filename); + } + + @Override + protected final FileSplit makeSplit(Path file, long start, long length, + String[] hosts) { + return inputFormatDelegate.makeSplit(file, start, length, hosts); + } + + @Override + protected final FileSplit makeSplit(Path file, long start, long length, + String[] hosts, String[] inMemoryHosts) { + return inputFormatDelegate.makeSplit(file, start, length, hosts, inMemoryHosts); + } + + @Override + public final FileStatus[] listStatus(JobConf job) throws IOException { + return inputFormatDelegate.listStatus(job); + } +} diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieTableInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieTableInputFormat.java new file mode 100644 index 0000000000000..d18cb7895ad00 --- /dev/null +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieTableInputFormat.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.hadoop; + +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.JobConf; + +import java.io.IOException; + +/** + * Abstract base class of the Hive's {@link FileInputFormat} implementations allowing for reading of Hudi's + * Copy-on-Write (COW) and Merge-on-Read (MOR) tables + */ +public abstract class HoodieTableInputFormat extends FileInputFormat + implements Configurable { + + protected Configuration conf; + + @Override + public final Configuration getConf() { + return conf; + } + + @Override + public final void setConf(Configuration conf) { + this.conf = conf; + } + + @Override + protected boolean isSplitable(FileSystem fs, Path filename) { + return super.isSplitable(fs, filename); + } + + @Override + protected FileSplit makeSplit(Path file, long start, long length, String[] hosts) { + return super.makeSplit(file, start, length, hosts); + } + + @Override + protected FileSplit makeSplit(Path file, long start, long length, String[] hosts, String[] inMemoryHosts) { + return super.makeSplit(file, start, length, hosts, inMemoryHosts); + } + + @Override + protected FileStatus[] listStatus(JobConf job) throws IOException { + return super.listStatus(job); + } +} diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/InputPathHandler.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/InputPathHandler.java index 07bd82afa9e9e..24d190700fea3 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/InputPathHandler.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/InputPathHandler.java @@ -35,7 +35,7 @@ import java.util.List; import java.util.Map; -import static org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.getTableMetaClientForBasePath; +import static org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.getTableMetaClientForBasePathUnchecked; /** * InputPathHandler takes in a set of input paths and incremental tables list. Then, classifies the @@ -107,7 +107,7 @@ private void parseInputPaths(Path[] inputPaths, List incrementalTables) // This path is for a table that we don't know about yet. HoodieTableMetaClient metaClient; try { - metaClient = getTableMetaClientForBasePath(inputPath.getFileSystem(conf), inputPath); + metaClient = getTableMetaClientForBasePathUnchecked(conf, inputPath); tableMetaClientMap.put(getIncrementalTable(metaClient), metaClient); tagAsIncrementalOrSnapshot(inputPath, metaClient, incrementalTables); } catch (TableNotFoundException | InvalidTableException e) { diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/PathWithLogFilePath.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/PathWithLogFilePath.java deleted file mode 100644 index 8f9ac8b03d575..0000000000000 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/PathWithLogFilePath.java +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.hadoop; - -import org.apache.hudi.common.model.HoodieLogFile; - -import org.apache.hadoop.fs.Path; - -import java.util.ArrayList; -import java.util.List; - -/** - * Encode additional information in Path to track matching log file and base files. - * Hence, this class tracks a log/base file status. - */ -public class PathWithLogFilePath extends Path { - // a flag to mark this split is produced by incremental query or not. - private boolean belongToIncrementalPath = false; - // the log files belong this path. - private List deltaLogFiles = new ArrayList<>(); - // max commit time of current path. - private String maxCommitTime = ""; - // the basePath of current hoodie table. - private String basePath = ""; - // the base file belong to this path; - private String baseFilePath = ""; - // the bootstrap file belong to this path. - // only if current query table is bootstrap table, this field is used. - private PathWithBootstrapFileStatus pathWithBootstrapFileStatus; - - public PathWithLogFilePath(Path parent, String child) { - super(parent, child); - } - - public void setBelongToIncrementalPath(boolean belongToIncrementalPath) { - this.belongToIncrementalPath = belongToIncrementalPath; - } - - public List getDeltaLogFiles() { - return deltaLogFiles; - } - - public void setDeltaLogFiles(List deltaLogFiles) { - this.deltaLogFiles = deltaLogFiles; - } - - public String getMaxCommitTime() { - return maxCommitTime; - } - - public void setMaxCommitTime(String maxCommitTime) { - this.maxCommitTime = maxCommitTime; - } - - public String getBasePath() { - return basePath; - } - - public void setBasePath(String basePath) { - this.basePath = basePath; - } - - public void setBaseFilePath(String baseFilePath) { - this.baseFilePath = baseFilePath; - } - - public boolean splitable() { - return !baseFilePath.isEmpty(); - } - - public PathWithBootstrapFileStatus getPathWithBootstrapFileStatus() { - return pathWithBootstrapFileStatus; - } - - public void setPathWithBootstrapFileStatus(PathWithBootstrapFileStatus pathWithBootstrapFileStatus) { - this.pathWithBootstrapFileStatus = pathWithBootstrapFileStatus; - } - - public boolean includeBootstrapFilePath() { - return pathWithBootstrapFileStatus != null; - } - - public BaseFileWithLogsSplit buildSplit(Path file, long start, long length, String[] hosts) { - BaseFileWithLogsSplit bs = new BaseFileWithLogsSplit(file, start, length, hosts); - bs.setBelongToIncrementalSplit(belongToIncrementalPath); - bs.setDeltaLogFiles(deltaLogFiles); - bs.setMaxCommitTime(maxCommitTime); - bs.setBasePath(basePath); - bs.setBaseFilePath(baseFilePath); - return bs; - } -} diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/RealtimeFileStatus.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/RealtimeFileStatus.java index e8e1a28987c56..641aa2759ff20 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/RealtimeFileStatus.java +++ 
b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/RealtimeFileStatus.java @@ -18,13 +18,14 @@ package org.apache.hudi.hadoop; -import org.apache.hudi.common.model.HoodieLogFile; - import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.hadoop.realtime.HoodieRealtimePath; +import org.apache.hudi.hadoop.realtime.HoodieVirtualKeyInfo; import java.io.IOException; -import java.util.ArrayList; import java.util.List; /** @@ -34,51 +35,62 @@ * in Path. */ public class RealtimeFileStatus extends FileStatus { - // a flag to mark this split is produced by incremental query or not. - private boolean belongToIncrementalFileStatus = false; - // the log files belong this fileStatus. - private List deltaLogFiles = new ArrayList<>(); - // max commit time of current fileStatus. + /** + * Base path of the table this path belongs to + */ + private final String basePath; + /** + * List of delta log-files holding updated records for this base-file + */ + private final List deltaLogFiles; + /** + * Marks whether this path was produced as part of an incremental query + */ + private final boolean belongsToIncrementalQuery; + /** + * Latest commit instant available at the time of the query in which all of the files + * pertaining to this split are represented + */ private String maxCommitTime = ""; - // the basePath of current hoodie table. - private String basePath = ""; - // the base file belong to this status; - private String baseFilePath = ""; - // the bootstrap file belong to this status. - // only if current query table is bootstrap table, this field is used. + /** + * File status for the Bootstrap file (only relevant if this table is a bootstrapped table) + */ private FileStatus bootStrapFileStatus; - - public RealtimeFileStatus(FileStatus fileStatus) throws IOException { + /** + * Virtual key configuration of the table this split belongs to + */ + private final Option virtualKeyInfo; + + public RealtimeFileStatus(FileStatus fileStatus, + String basePath, + List deltaLogFiles, + boolean belongsToIncrementalQuery, + Option virtualKeyInfo) throws IOException { super(fileStatus); + this.basePath = basePath; + this.deltaLogFiles = deltaLogFiles; + this.belongsToIncrementalQuery = belongsToIncrementalQuery; + this.virtualKeyInfo = virtualKeyInfo; } @Override public Path getPath() { Path path = super.getPath(); - PathWithLogFilePath pathWithLogFilePath = new PathWithLogFilePath(path.getParent(), path.getName()); - pathWithLogFilePath.setBelongToIncrementalPath(belongToIncrementalFileStatus); - pathWithLogFilePath.setDeltaLogFiles(deltaLogFiles); - pathWithLogFilePath.setMaxCommitTime(maxCommitTime); - pathWithLogFilePath.setBasePath(basePath); - pathWithLogFilePath.setBaseFilePath(baseFilePath); + + HoodieRealtimePath realtimePath = new HoodieRealtimePath(path.getParent(), path.getName(), basePath, + deltaLogFiles, maxCommitTime, belongsToIncrementalQuery, virtualKeyInfo); + if (bootStrapFileStatus != null) { - pathWithLogFilePath.setPathWithBootstrapFileStatus((PathWithBootstrapFileStatus)bootStrapFileStatus.getPath()); + realtimePath.setPathWithBootstrapFileStatus((PathWithBootstrapFileStatus)bootStrapFileStatus.getPath()); } - return pathWithLogFilePath; - } - public void setBelongToIncrementalFileStatus(boolean belongToIncrementalFileStatus) { - this.belongToIncrementalFileStatus = belongToIncrementalFileStatus; + return realtimePath; } public List getDeltaLogFiles() { return 
deltaLogFiles; } - public void setDeltaLogFiles(List deltaLogFiles) { - this.deltaLogFiles = deltaLogFiles; - } - public String getMaxCommitTime() { return maxCommitTime; } @@ -87,18 +99,6 @@ public void setMaxCommitTime(String maxCommitTime) { this.maxCommitTime = maxCommitTime; } - public String getBasePath() { - return basePath; - } - - public void setBasePath(String basePath) { - this.basePath = basePath; - } - - public void setBaseFilePath(String baseFilePath) { - this.baseFilePath = baseFilePath; - } - public void setBootStrapFileStatus(FileStatus bootStrapFileStatus) { this.bootStrapFileStatus = bootStrapFileStatus; } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/hive/HoodieCombineHiveInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/hive/HoodieCombineHiveInputFormat.java index c24c75359f588..8736883cea72c 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/hive/HoodieCombineHiveInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/hive/HoodieCombineHiveInputFormat.java @@ -18,12 +18,6 @@ package org.apache.hudi.hadoop.hive; -import org.apache.hudi.common.util.ReflectionUtils; -import org.apache.hudi.common.util.ValidationUtils; -import org.apache.hudi.hadoop.HoodieParquetInputFormat; -import org.apache.hudi.hadoop.realtime.HoodieCombineRealtimeRecordReader; -import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat; -import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; @@ -62,6 +56,13 @@ import org.apache.hadoop.mapred.lib.CombineFileInputFormat; import org.apache.hadoop.mapred.lib.CombineFileSplit; import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.hadoop.HoodieParquetInputFormat; +import org.apache.hudi.hadoop.HoodieParquetInputFormatBase; +import org.apache.hudi.hadoop.realtime.HoodieCombineRealtimeRecordReader; +import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat; +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -876,7 +877,7 @@ protected List listStatus(JobContext job) throws IOException { LOG.info("Listing status in HoodieCombineHiveInputFormat.HoodieCombineFileInputFormatShim"); List result; if (hoodieFilter) { - HoodieParquetInputFormat input; + HoodieParquetInputFormatBase input; if (isRealTime) { LOG.info("Using HoodieRealtimeInputFormat"); input = createParquetRealtimeInputFormat(); @@ -916,7 +917,7 @@ public CombineFileSplit[] getSplits(JobConf job, int numSplits) throws IOExcepti job.set("hudi.hive.realtime", "true"); InputSplit[] splits; if (hoodieFilter) { - HoodieParquetInputFormat input = createParquetRealtimeInputFormat(); + HoodieParquetRealtimeInputFormat input = createParquetRealtimeInputFormat(); input.setConf(job); splits = input.getSplits(job, numSplits); } else { diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java index 78ac8805d8aaf..030e20f2278b4 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java @@ -19,6 +19,7 @@ 
package org.apache.hudi.hadoop.realtime; import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodiePayloadProps; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.log.LogReaderUtils; import org.apache.hudi.exception.HoodieIOException; @@ -39,6 +40,7 @@ import java.util.Arrays; import java.util.List; import java.util.Map; +import java.util.Properties; import java.util.stream.Collectors; /** @@ -50,6 +52,7 @@ public abstract class AbstractRealtimeRecordReader { protected final RealtimeSplit split; protected final JobConf jobConf; protected final boolean usesCustomPayload; + protected Properties payloadProps = new Properties(); // Schema handles private Schema readerSchema; private Schema writerSchema; @@ -62,7 +65,11 @@ public AbstractRealtimeRecordReader(RealtimeSplit split, JobConf job) { LOG.info("columnIds ==> " + job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR)); LOG.info("partitioningColumns ==> " + job.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "")); try { - this.usesCustomPayload = usesCustomPayload(); + HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(jobConf).setBasePath(split.getBasePath()).build(); + if (metaClient.getTableConfig().getPreCombineField() != null) { + this.payloadProps.setProperty(HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP_KEY, metaClient.getTableConfig().getPreCombineField()); + } + this.usesCustomPayload = usesCustomPayload(metaClient); LOG.info("usesCustomPayload ==> " + this.usesCustomPayload); init(); } catch (IOException e) { @@ -70,8 +77,7 @@ public AbstractRealtimeRecordReader(RealtimeSplit split, JobConf job) { } } - private boolean usesCustomPayload() { - HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(jobConf).setBasePath(split.getBasePath()).build(); + private boolean usesCustomPayload(HoodieTableMetaClient metaClient) { return !(metaClient.getTableConfig().getPayloadClass().contains(HoodieAvroPayload.class.getName()) || metaClient.getTableConfig().getPayloadClass().contains("org.apache.hudi.OverwriteWithLatestAvroPayload")); } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieHFileRealtimeInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieHFileRealtimeInputFormat.java index 525bec61333e4..799d90bce5df4 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieHFileRealtimeInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieHFileRealtimeInputFormat.java @@ -18,15 +18,15 @@ package org.apache.hudi.hadoop.realtime; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; import org.apache.hadoop.io.ArrayWritable; import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapred.InputSplit; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.RecordReader; import org.apache.hadoop.mapred.Reporter; -import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.hadoop.HoodieHFileInputFormat; @@ -38,29 +38,18 @@ import org.apache.log4j.Logger; import java.io.IOException; -import java.util.Arrays; -import java.util.stream.Stream; /** * HoodieRealtimeInputFormat for HUDI datasets which store data in HFile 
base file format. */ @UseRecordReaderFromInputFormat @UseFileSplitsFromInputFormat -public class HoodieHFileRealtimeInputFormat extends HoodieHFileInputFormat { +public class HoodieHFileRealtimeInputFormat extends HoodieMergeOnReadTableInputFormat { private static final Logger LOG = LogManager.getLogger(HoodieHFileRealtimeInputFormat.class); - @Override - public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { - Stream fileSplits = Arrays.stream(super.getSplits(job, numSplits)).map(is -> (FileSplit) is); - return HoodieRealtimeInputFormatUtils.getRealtimeSplits(job, fileSplits); - } - - @Override - protected HoodieDefaultTimeline filterInstantsTimeline(HoodieDefaultTimeline timeline) { - // no specific filtering for Realtime format - return timeline; - } + // NOTE: We're only using {@code HoodieHFileInputFormat} to compose {@code RecordReader} + private final HoodieHFileInputFormat hFileInputFormat = new HoodieHFileInputFormat(); @Override public RecordReader getRecordReader(final InputSplit split, final JobConf jobConf, @@ -99,6 +88,12 @@ public RecordReader getRecordReader(final InputSpli "HoodieRealtimeRecordReader can only work on HoodieRealtimeFileSplit and not with " + split); return new HoodieRealtimeRecordReader((HoodieRealtimeFileSplit) split, jobConf, - super.getRecordReader(split, jobConf, reporter)); + hFileInputFormat.getRecordReader(split, jobConf, reporter)); + } + + @Override + protected boolean isSplitable(FileSystem fs, Path filename) { + // This file isn't splittable. + return false; } } \ No newline at end of file diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieMergeOnReadTableInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieMergeOnReadTableInputFormat.java new file mode 100644 index 0000000000000..982d52b0d4807 --- /dev/null +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieMergeOnReadTableInputFormat.java @@ -0,0 +1,375 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.hadoop.realtime; + +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.SplitLocationInfo; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieFileGroup; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.hadoop.BootstrapBaseFileSplit; +import org.apache.hudi.hadoop.FileStatusWithBootstrapBaseFile; +import org.apache.hudi.hadoop.HiveHoodieTableFileIndex; +import org.apache.hudi.hadoop.HoodieCopyOnWriteTableInputFormat; +import org.apache.hudi.hadoop.LocatedFileStatusWithBootstrapBaseFile; +import org.apache.hudi.hadoop.RealtimeFileStatus; +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; +import org.apache.hudi.hadoop.utils.HoodieRealtimeInputFormatUtils; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.apache.hudi.common.util.ValidationUtils.checkState; + +/** + * Base implementation of Hive's {@link FileInputFormat} allowing for reading of Hudi's + * Merge-on-Read (MOR) tables in various configurations: + * + * <ul>
+ *   <li>Snapshot mode: reading table's state as of a particular timestamp (or instant, in Hudi's terms)</li> + *   <li>Incremental mode: reading records added or updated after a particular timestamp (or instant, in Hudi's terms)</li> + *   <li>External mode: reading non-Hudi partitions</li> + * </ul> + *
    + * NOTE: This class is invariant of the underlying file-format of the files being read + */ +public class HoodieMergeOnReadTableInputFormat extends HoodieCopyOnWriteTableInputFormat implements Configurable { + + @Override + public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { + List fileSplits = Arrays.stream(super.getSplits(job, numSplits)) + .map(is -> (FileSplit) is) + .collect(Collectors.toList()); + + return (containsIncrementalQuerySplits(fileSplits) ? filterIncrementalQueryFileSplits(fileSplits) : fileSplits) + .toArray(new FileSplit[0]); + } + + @Override + protected FileStatus createFileStatusUnchecked(FileSlice fileSlice, HiveHoodieTableFileIndex fileIndex, Option virtualKeyInfoOpt) { + Option baseFileOpt = fileSlice.getBaseFile(); + Option latestLogFileOpt = fileSlice.getLatestLogFile(); + Stream logFiles = fileSlice.getLogFiles(); + + Option latestCompletedInstantOpt = fileIndex.getLatestCompletedInstant(); + String tableBasePath = fileIndex.getBasePath(); + + // Check if we're reading a MOR table + if (baseFileOpt.isPresent()) { + return createRealtimeFileStatusUnchecked(baseFileOpt.get(), logFiles, tableBasePath, latestCompletedInstantOpt, virtualKeyInfoOpt); + } else if (latestLogFileOpt.isPresent()) { + return createRealtimeFileStatusUnchecked(latestLogFileOpt.get(), logFiles, tableBasePath, latestCompletedInstantOpt, virtualKeyInfoOpt); + } else { + throw new IllegalStateException("Invalid state: either base-file or log-file has to be present"); + } + } + + /** + * Keep the logic of mor_incr_view as same as spark datasource. + * Step1: Get list of commits to be fetched based on start commit and max commits(for snapshot max commits is -1). + * Step2: Get list of affected files status for these affected file status. + * Step3: Construct HoodieTableFileSystemView based on those affected file status. + * a. Filter affected partitions based on inputPaths. + * b. Get list of fileGroups based on affected partitions by fsView.getAllFileGroups. + * Step4: Set input paths based on filtered affected partition paths. changes that amony original input paths passed to + * this method. some partitions did not have commits as part of the trimmed down list of commits and hence we need this step. + * Step5: Find candidate fileStatus, since when we get baseFileStatus from HoodieTableFileSystemView, + * the BaseFileStatus will missing file size information. + * We should use candidate fileStatus to update the size information for BaseFileStatus. + * Step6: For every file group from step3(b) + * Get 1st available base file from all file slices. then we use candidate file status to update the baseFileStatus, + * and construct RealTimeFileStatus and add it to result along with log files. + * If file group just has log files, construct RealTimeFileStatus and add it to result. 
+ * TODO: unify the incremental view code between hive/spark-sql and spark datasource + */ + @Override + protected List listStatusForIncrementalMode(JobConf job, + HoodieTableMetaClient tableMetaClient, + List inputPaths, + String incrementalTableName) throws IOException { + List result = new ArrayList<>(); + Job jobContext = Job.getInstance(job); + + // step1 + Option timeline = HoodieInputFormatUtils.getFilteredCommitsTimeline(jobContext, tableMetaClient); + if (!timeline.isPresent()) { + return result; + } + HoodieTimeline commitsTimelineToReturn = HoodieInputFormatUtils.getHoodieTimelineForIncrementalQuery(jobContext, incrementalTableName, timeline.get()); + Option> commitsToCheck = Option.of(commitsTimelineToReturn.getInstants().collect(Collectors.toList())); + if (!commitsToCheck.isPresent()) { + return result; + } + // step2 + commitsToCheck.get().sort(HoodieInstant::compareTo); + List metadataList = commitsToCheck + .get().stream().map(instant -> { + try { + return HoodieInputFormatUtils.getCommitMetadata(instant, commitsTimelineToReturn); + } catch (IOException e) { + throw new HoodieException(String.format("cannot get metadata for instant: %s", instant)); + } + }).collect(Collectors.toList()); + + // build fileGroup from fsView + List affectedFileStatus = Arrays.asList(HoodieInputFormatUtils + .listAffectedFilesForCommits(job, new Path(tableMetaClient.getBasePath()), metadataList)); + // step3 + HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(tableMetaClient, commitsTimelineToReturn, affectedFileStatus.toArray(new FileStatus[0])); + // build fileGroup from fsView + Path basePath = new Path(tableMetaClient.getBasePath()); + // filter affectedPartition by inputPaths + List affectedPartition = HoodieInputFormatUtils.getWritePartitionPaths(metadataList).stream() + .filter(k -> k.isEmpty() ? inputPaths.contains(basePath) : inputPaths.contains(new Path(basePath, k))).collect(Collectors.toList()); + if (affectedPartition.isEmpty()) { + return result; + } + List fileGroups = affectedPartition.stream() + .flatMap(partitionPath -> fsView.getAllFileGroups(partitionPath)).collect(Collectors.toList()); + // step4 + setInputPaths(job, affectedPartition.stream() + .map(p -> p.isEmpty() ? basePath.toString() : new Path(basePath, p).toString()).collect(Collectors.joining(","))); + + // step5 + // find all file status in partitionPaths. + FileStatus[] fileStatuses = doListStatus(job); + Map candidateFileStatus = new HashMap<>(); + for (int i = 0; i < fileStatuses.length; i++) { + String key = fileStatuses[i].getPath().toString(); + candidateFileStatus.put(key, fileStatuses[i]); + } + + Option virtualKeyInfoOpt = getHoodieVirtualKeyInfo(tableMetaClient); + String maxCommitTime = fsView.getLastInstant().get().getTimestamp(); + // step6 + result.addAll(collectAllIncrementalFiles(fileGroups, maxCommitTime, basePath.toString(), candidateFileStatus, virtualKeyInfoOpt)); + return result; + } + + @Override + protected boolean isSplitable(FileSystem fs, Path filename) { + if (filename instanceof HoodieRealtimePath) { + return ((HoodieRealtimePath) filename).isSplitable(); + } + + return super.isSplitable(fs, filename); + } + + // make split for path. + // When query the incremental view, the read files may be bootstrap files, we wrap those bootstrap files into + // PathWithLogFilePath, so those bootstrap files should be processed int this function. 
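// --------------------------------------------------------------------------------------------
// Editorial sketch (not part of the patch): the incremental listing above (steps 1 to 6) is
// driven by per-table Hive session properties that Hudi reads through HoodieHiveUtils. Below
// is a minimal, self-contained illustration of the consumer side; the table name "trips" and
// the instant value are placeholders, while the "hoodie.<table>.consume.*" keys follow the
// pattern Hudi documents for Hive incremental queries.
import org.apache.hadoop.mapred.JobConf;

class IncrementalQueryConfExample {
  public static void main(String[] args) {
    JobConf job = new JobConf();
    // Switch this table from the default SNAPSHOT scan to an INCREMENTAL scan
    job.set("hoodie.trips.consume.mode", "INCREMENTAL");
    // Step 1 above filters the commit timeline to instants after this start instant...
    job.set("hoodie.trips.consume.start.timestamp", "20220101000000");
    // ...and caps how many commits are pulled (-1 would mean "all", as in the snapshot case)
    job.set("hoodie.trips.consume.max.commits", "3");
    System.out.println(job.get("hoodie.trips.consume.mode"));
  }
}
// --------------------------------------------------------------------------------------------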
+ @Override + protected FileSplit makeSplit(Path file, long start, long length, String[] hosts) { + if (file instanceof HoodieRealtimePath) { + return doMakeSplitForRealtimePath((HoodieRealtimePath) file, start, length, hosts, null); + } + return super.makeSplit(file, start, length, hosts); + } + + @Override + protected FileSplit makeSplit(Path file, long start, long length, String[] hosts, String[] inMemoryHosts) { + if (file instanceof HoodieRealtimePath) { + return doMakeSplitForRealtimePath((HoodieRealtimePath) file, start, length, hosts, inMemoryHosts); + } + return super.makeSplit(file, start, length, hosts, inMemoryHosts); + } + + private static List collectAllIncrementalFiles(List fileGroups, + String maxCommitTime, + String basePath, + Map candidateFileStatus, + Option virtualKeyInfoOpt) { + + List result = new ArrayList<>(); + fileGroups.stream().forEach(f -> { + try { + List baseFiles = f.getAllFileSlices().filter(slice -> slice.getBaseFile().isPresent()).collect(Collectors.toList()); + if (!baseFiles.isEmpty()) { + FileStatus baseFileStatus = HoodieInputFormatUtils.getFileStatus(baseFiles.get(0).getBaseFile().get()); + String baseFilePath = baseFileStatus.getPath().toUri().toString(); + if (!candidateFileStatus.containsKey(baseFilePath)) { + throw new HoodieException("Error obtaining fileStatus for file: " + baseFilePath); + } + List deltaLogFiles = f.getLatestFileSlice().get().getLogFiles().collect(Collectors.toList()); + // We cannot use baseFileStatus.getPath() here, since baseFileStatus.getPath() missing file size information. + // So we use candidateFileStatus.get(baseFileStatus.getPath()) to get a correct path. + RealtimeFileStatus fileStatus = new RealtimeFileStatus(candidateFileStatus.get(baseFilePath), + basePath, deltaLogFiles, true, virtualKeyInfoOpt); + fileStatus.setMaxCommitTime(maxCommitTime); + if (baseFileStatus instanceof LocatedFileStatusWithBootstrapBaseFile || baseFileStatus instanceof FileStatusWithBootstrapBaseFile) { + fileStatus.setBootStrapFileStatus(baseFileStatus); + } + result.add(fileStatus); + } + // add file group which has only logs. + if (f.getLatestFileSlice().isPresent() && baseFiles.isEmpty()) { + List logFileStatus = f.getLatestFileSlice().get().getLogFiles().map(logFile -> logFile.getFileStatus()).collect(Collectors.toList()); + if (logFileStatus.size() > 0) { + List deltaLogFiles = logFileStatus.stream().map(l -> new HoodieLogFile(l.getPath(), l.getLen())).collect(Collectors.toList()); + RealtimeFileStatus fileStatus = new RealtimeFileStatus(logFileStatus.get(0), basePath, + deltaLogFiles, true, virtualKeyInfoOpt); + fileStatus.setMaxCommitTime(maxCommitTime); + result.add(fileStatus); + } + } + } catch (IOException e) { + throw new HoodieException("Error obtaining data file/log file grouping ", e); + } + }); + return result; + } + + private FileSplit doMakeSplitForRealtimePath(HoodieRealtimePath path, long start, long length, String[] hosts, String[] inMemoryHosts) { + if (path.includeBootstrapFilePath()) { + FileSplit bf = + inMemoryHosts == null + ? 
super.makeSplit(path.getPathWithBootstrapFileStatus(), start, length, hosts) + : super.makeSplit(path.getPathWithBootstrapFileStatus(), start, length, hosts, inMemoryHosts); + return createRealtimeBoostrapBaseFileSplit( + (BootstrapBaseFileSplit) bf, + path.getBasePath(), + path.getDeltaLogFiles(), + path.getMaxCommitTime(), + path.getBelongsToIncrementalQuery(), + path.getVirtualKeyInfo() + ); + } + + return createRealtimeFileSplit(path, start, length, hosts); + } + + private static boolean containsIncrementalQuerySplits(List fileSplits) { + return fileSplits.stream().anyMatch(HoodieRealtimeInputFormatUtils::doesBelongToIncrementalQuery); + } + + private static List filterIncrementalQueryFileSplits(List fileSplits) { + return fileSplits.stream().filter(HoodieRealtimeInputFormatUtils::doesBelongToIncrementalQuery) + .collect(Collectors.toList()); + } + + private static HoodieRealtimeFileSplit createRealtimeFileSplit(HoodieRealtimePath path, long start, long length, String[] hosts) { + try { + return new HoodieRealtimeFileSplit(new FileSplit(path, start, length, hosts), path); + } catch (IOException e) { + throw new HoodieIOException(String.format("Failed to create instance of %s", HoodieRealtimeFileSplit.class.getName()), e); + } + } + + private static HoodieRealtimeBootstrapBaseFileSplit createRealtimeBoostrapBaseFileSplit(BootstrapBaseFileSplit split, + String basePath, + List logFiles, + String maxInstantTime, + boolean belongsToIncrementalQuery, + Option virtualKeyInfoOpt) { + try { + String[] hosts = split.getLocationInfo() != null ? Arrays.stream(split.getLocationInfo()) + .filter(x -> !x.isInMemory()).toArray(String[]::new) : new String[0]; + String[] inMemoryHosts = split.getLocationInfo() != null ? Arrays.stream(split.getLocationInfo()) + .filter(SplitLocationInfo::isInMemory).toArray(String[]::new) : new String[0]; + FileSplit baseSplit = new FileSplit(split.getPath(), split.getStart(), split.getLength(), + hosts, inMemoryHosts); + return new HoodieRealtimeBootstrapBaseFileSplit(baseSplit, basePath, logFiles, maxInstantTime, split.getBootstrapFileSplit(), + belongsToIncrementalQuery, virtualKeyInfoOpt); + } catch (IOException e) { + throw new HoodieIOException("Error creating hoodie real time split ", e); + } + } + + /** + * Creates {@link RealtimeFileStatus} for the file-slice where base file is present + */ + private static RealtimeFileStatus createRealtimeFileStatusUnchecked(HoodieBaseFile baseFile, + Stream logFiles, + String basePath, + Option latestCompletedInstantOpt, + Option virtualKeyInfoOpt) { + FileStatus baseFileStatus = getFileStatusUnchecked(baseFile); + List sortedLogFiles = logFiles.sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList()); + + try { + RealtimeFileStatus rtFileStatus = new RealtimeFileStatus(baseFileStatus, basePath, sortedLogFiles, + false, virtualKeyInfoOpt); + + if (latestCompletedInstantOpt.isPresent()) { + HoodieInstant latestCompletedInstant = latestCompletedInstantOpt.get(); + checkState(latestCompletedInstant.isCompleted()); + + rtFileStatus.setMaxCommitTime(latestCompletedInstant.getTimestamp()); + } + + if (baseFileStatus instanceof LocatedFileStatusWithBootstrapBaseFile || baseFileStatus instanceof FileStatusWithBootstrapBaseFile) { + rtFileStatus.setBootStrapFileStatus(baseFileStatus); + } + + return rtFileStatus; + } catch (IOException e) { + throw new HoodieIOException(String.format("Failed to init %s", RealtimeFileStatus.class.getSimpleName()), e); + } + } + + /** + * Creates {@link RealtimeFileStatus} for the 
file-slice where base file is NOT present + */ + private static RealtimeFileStatus createRealtimeFileStatusUnchecked(HoodieLogFile latestLogFile, + Stream logFiles, + String basePath, + Option latestCompletedInstantOpt, + Option virtualKeyInfoOpt) { + List sortedLogFiles = logFiles.sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList()); + try { + RealtimeFileStatus rtFileStatus = new RealtimeFileStatus(latestLogFile.getFileStatus(), basePath, + sortedLogFiles, false, virtualKeyInfoOpt); + + if (latestCompletedInstantOpt.isPresent()) { + HoodieInstant latestCompletedInstant = latestCompletedInstantOpt.get(); + checkState(latestCompletedInstant.isCompleted()); + + rtFileStatus.setMaxCommitTime(latestCompletedInstant.getTimestamp()); + } + + return rtFileStatus; + } catch (IOException e) { + throw new HoodieIOException(String.format("Failed to init %s", RealtimeFileStatus.class.getSimpleName()), e); + } + } +} + diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java index f3cf4ffa86578..e8c806ed2cf67 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieParquetRealtimeInputFormat.java @@ -18,253 +18,60 @@ package org.apache.hudi.hadoop.realtime; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.FileSlice; -import org.apache.hudi.common.model.HoodieCommitMetadata; -import org.apache.hudi.common.model.HoodieFileGroup; -import org.apache.hudi.common.model.HoodieLogFile; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline; -import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.table.view.HoodieTableFileSystemView; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.ValidationUtils; -import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.hadoop.BootstrapBaseFileSplit; -import org.apache.hudi.hadoop.FileStatusWithBootstrapBaseFile; -import org.apache.hudi.hadoop.LocatedFileStatusWithBootstrapBaseFile; -import org.apache.hudi.hadoop.RealtimeFileStatus; -import org.apache.hudi.hadoop.PathWithLogFilePath; -import org.apache.hudi.hadoop.HoodieParquetInputFormat; -import org.apache.hudi.hadoop.UseFileSplitsFromInputFormat; -import org.apache.hudi.hadoop.UseRecordReaderFromInputFormat; -import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; -import org.apache.hudi.hadoop.utils.HoodieRealtimeInputFormatUtils; - -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.conf.Configurable; -import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; import org.apache.hadoop.io.ArrayWritable; import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapred.InputSplit; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.RecordReader; import org.apache.hadoop.mapred.Reporter; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.hadoop.HoodieParquetInputFormat; +import 
org.apache.hudi.hadoop.UseFileSplitsFromInputFormat; +import org.apache.hudi.hadoop.UseRecordReaderFromInputFormat; +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; +import org.apache.hudi.hadoop.utils.HoodieRealtimeInputFormatUtils; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.HashMap; -import java.util.stream.Collectors; - /** * Input Format, that provides a real-time view of data in a Hoodie table. */ @UseRecordReaderFromInputFormat @UseFileSplitsFromInputFormat -public class HoodieParquetRealtimeInputFormat extends HoodieParquetInputFormat implements Configurable { +public class HoodieParquetRealtimeInputFormat extends HoodieParquetInputFormat { private static final Logger LOG = LogManager.getLogger(HoodieParquetRealtimeInputFormat.class); + public HoodieParquetRealtimeInputFormat() { + super(new HoodieMergeOnReadTableInputFormat()); + } + // To make Hive on Spark queries work with RT tables. Our theory is that due to // {@link org.apache.hadoop.hive.ql.io.parquet.ProjectionPusher} // not handling empty list correctly, the ParquetRecordReaderWrapper ends up adding the same column ids multiple // times which ultimately breaks the query. - - @Override - public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { - - List fileSplits = Arrays.stream(super.getSplits(job, numSplits)).map(is -> (FileSplit) is).collect(Collectors.toList()); - - boolean isIncrementalSplits = HoodieRealtimeInputFormatUtils.isIncrementalQuerySplits(fileSplits); - - return isIncrementalSplits - ? HoodieRealtimeInputFormatUtils.getIncrementalRealtimeSplits(job, fileSplits.stream()) - : HoodieRealtimeInputFormatUtils.getRealtimeSplits(job, fileSplits.stream()); - } - - /** - * Keep the logic of mor_incr_view as same as spark datasource. - * Step1: Get list of commits to be fetched based on start commit and max commits(for snapshot max commits is -1). - * Step2: Get list of affected files status for these affected file status. - * Step3: Construct HoodieTableFileSystemView based on those affected file status. - * a. Filter affected partitions based on inputPaths. - * b. Get list of fileGroups based on affected partitions by fsView.getAllFileGroups. - * Step4: Set input paths based on filtered affected partition paths. changes that amony original input paths passed to - * this method. some partitions did not have commits as part of the trimmed down list of commits and hence we need this step. - * Step5: Find candidate fileStatus, since when we get baseFileStatus from HoodieTableFileSystemView, - * the BaseFileStatus will missing file size information. - * We should use candidate fileStatus to update the size information for BaseFileStatus. - * Step6: For every file group from step3(b) - * Get 1st available base file from all file slices. then we use candidate file status to update the baseFileStatus, - * and construct RealTimeFileStatus and add it to result along with log files. - * If file group just has log files, construct RealTimeFileStatus and add it to result. 
- * TODO: unify the incremental view code between hive/spark-sql and spark datasource - */ - @Override - protected List listStatusForIncrementalMode( - JobConf job, HoodieTableMetaClient tableMetaClient, List inputPaths, String incrementalTable) throws IOException { - List result = new ArrayList<>(); - Job jobContext = Job.getInstance(job); - - // step1 - Option timeline = HoodieInputFormatUtils.getFilteredCommitsTimeline(jobContext, tableMetaClient); - if (!timeline.isPresent()) { - return result; - } - HoodieTimeline commitsTimelineToReturn = HoodieInputFormatUtils.getHoodieTimelineForIncrementalQuery(jobContext, incrementalTable, timeline.get()); - Option> commitsToCheck = Option.of(commitsTimelineToReturn.getInstants().collect(Collectors.toList())); - if (!commitsToCheck.isPresent()) { - return result; - } - // step2 - commitsToCheck.get().sort(HoodieInstant::compareTo); - List metadataList = commitsToCheck - .get().stream().map(instant -> { - try { - return HoodieInputFormatUtils.getCommitMetadata(instant, commitsTimelineToReturn); - } catch (IOException e) { - throw new HoodieException(String.format("cannot get metadata for instant: %s", instant)); - } - }).collect(Collectors.toList()); - - // build fileGroup from fsView - List affectedFileStatus = Arrays.asList(HoodieInputFormatUtils - .listAffectedFilesForCommits(new Path(tableMetaClient.getBasePath()), metadataList)); - // step3 - HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(tableMetaClient, commitsTimelineToReturn, affectedFileStatus.toArray(new FileStatus[0])); - // build fileGroup from fsView - Path basePath = new Path(tableMetaClient.getBasePath()); - // filter affectedPartition by inputPaths - List affectedPartition = HoodieInputFormatUtils.getWritePartitionPaths(metadataList).stream() - .filter(k -> k.isEmpty() ? inputPaths.contains(basePath) : inputPaths.contains(new Path(basePath, k))).collect(Collectors.toList()); - if (affectedPartition.isEmpty()) { - return result; - } - List fileGroups = affectedPartition.stream() - .flatMap(partitionPath -> fsView.getAllFileGroups(partitionPath)).collect(Collectors.toList()); - // step4 - setInputPaths(job, affectedPartition.stream() - .map(p -> p.isEmpty() ? basePath.toString() : new Path(basePath, p).toString()).collect(Collectors.joining(","))); - - // step5 - // find all file status in partitionPaths. 
- FileStatus[] fileStatuses = doListStatus(job); - Map candidateFileStatus = new HashMap<>(); - for (int i = 0; i < fileStatuses.length; i++) { - String key = fileStatuses[i].getPath().toString(); - candidateFileStatus.put(key, fileStatuses[i]); - } - - String maxCommitTime = fsView.getLastInstant().get().getTimestamp(); - // step6 - result.addAll(collectAllIncrementalFiles(fileGroups, maxCommitTime, basePath.toString(), candidateFileStatus)); - return result; - } - - private List collectAllIncrementalFiles(List fileGroups, String maxCommitTime, String basePath, Map candidateFileStatus) { - List result = new ArrayList<>(); - fileGroups.stream().forEach(f -> { - try { - List baseFiles = f.getAllFileSlices().filter(slice -> slice.getBaseFile().isPresent()).collect(Collectors.toList()); - if (!baseFiles.isEmpty()) { - FileStatus baseFileStatus = HoodieInputFormatUtils.getFileStatus(baseFiles.get(0).getBaseFile().get()); - String baseFilePath = baseFileStatus.getPath().toUri().toString(); - if (!candidateFileStatus.containsKey(baseFilePath)) { - throw new HoodieException("Error obtaining fileStatus for file: " + baseFilePath); - } - // We cannot use baseFileStatus.getPath() here, since baseFileStatus.getPath() missing file size information. - // So we use candidateFileStatus.get(baseFileStatus.getPath()) to get a correct path. - RealtimeFileStatus fileStatus = new RealtimeFileStatus(candidateFileStatus.get(baseFilePath)); - fileStatus.setMaxCommitTime(maxCommitTime); - fileStatus.setBelongToIncrementalFileStatus(true); - fileStatus.setBasePath(basePath); - fileStatus.setBaseFilePath(baseFilePath); - fileStatus.setDeltaLogFiles(f.getLatestFileSlice().get().getLogFiles().collect(Collectors.toList())); - // try to set bootstrapfileStatus - if (baseFileStatus instanceof LocatedFileStatusWithBootstrapBaseFile || baseFileStatus instanceof FileStatusWithBootstrapBaseFile) { - fileStatus.setBootStrapFileStatus(baseFileStatus); - } - result.add(fileStatus); - } - // add file group which has only logs. - if (f.getLatestFileSlice().isPresent() && baseFiles.isEmpty()) { - List logFileStatus = f.getLatestFileSlice().get().getLogFiles().map(logFile -> logFile.getFileStatus()).collect(Collectors.toList()); - if (logFileStatus.size() > 0) { - RealtimeFileStatus fileStatus = new RealtimeFileStatus(logFileStatus.get(0)); - fileStatus.setBelongToIncrementalFileStatus(true); - fileStatus.setDeltaLogFiles(logFileStatus.stream().map(l -> new HoodieLogFile(l.getPath(), l.getLen())).collect(Collectors.toList())); - fileStatus.setMaxCommitTime(maxCommitTime); - fileStatus.setBasePath(basePath); - result.add(fileStatus); - } - } - } catch (IOException e) { - throw new HoodieException("Error obtaining data file/log file grouping ", e); - } - }); - return result; - } - - @Override - protected boolean includeLogFilesForSnapShotView() { - return true; - } - - @Override - protected boolean isSplitable(FileSystem fs, Path filename) { - if (filename instanceof PathWithLogFilePath) { - return ((PathWithLogFilePath)filename).splitable(); - } - return super.isSplitable(fs, filename); - } - - // make split for path. - // When query the incremental view, the read files may be bootstrap files, we wrap those bootstrap files into - // PathWithLogFilePath, so those bootstrap files should be processed int this function. 
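// --------------------------------------------------------------------------------------------
// Editorial sketch (not part of the patch): the logic removed in this hunk now lives in
// HoodieMergeOnReadTableInputFormat, and HoodieParquetRealtimeInputFormat simply hands that
// planner to its parent via super(new HoodieMergeOnReadTableInputFormat()). A simplified,
// self-contained model of that constructor delegation follows; the types below are stand-ins
// for illustration, not the actual Hudi classes.
import java.util.Collections;
import java.util.List;

class DelegationExample {
  interface SplitPlanner {
    List<String> planSplits(String tablePath);
  }

  static class MergeOnReadPlanner implements SplitPlanner {
    @Override
    public List<String> planSplits(String tablePath) {
      // One entry per file-slice: the base file plus its delta log files
      return Collections.singletonList(tablePath + "/fg1.parquet + [fg1.log.1, fg1.log.2]");
    }
  }

  static class RealtimeFormat {
    private final SplitPlanner planner;

    RealtimeFormat(SplitPlanner planner) {
      this.planner = planner; // mirrors super(new HoodieMergeOnReadTableInputFormat())
    }

    List<String> getSplits(String tablePath) {
      return planner.planSplits(tablePath); // no split planning re-implemented here
    }
  }

  public static void main(String[] args) {
    System.out.println(new RealtimeFormat(new MergeOnReadPlanner()).getSplits("/tmp/hudi/trips"));
  }
}
// --------------------------------------------------------------------------------------------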
- @Override - protected FileSplit makeSplit(Path file, long start, long length, String[] hosts) { - if (file instanceof PathWithLogFilePath) { - return doMakeSplitForPathWithLogFilePath((PathWithLogFilePath) file, start, length, hosts, null); - } - return super.makeSplit(file, start, length, hosts); - } - @Override - protected FileSplit makeSplit(Path file, long start, long length, String[] hosts, String[] inMemoryHosts) { - if (file instanceof PathWithLogFilePath) { - return doMakeSplitForPathWithLogFilePath((PathWithLogFilePath) file, start, length, hosts, inMemoryHosts); - } - return super.makeSplit(file, start, length, hosts, inMemoryHosts); - } + public RecordReader getRecordReader(final InputSplit split, final JobConf jobConf, + final Reporter reporter) throws IOException { + // sanity check + ValidationUtils.checkArgument(split instanceof RealtimeSplit, + "HoodieRealtimeRecordReader can only work on RealtimeSplit and not with " + split); + RealtimeSplit realtimeSplit = (RealtimeSplit) split; + addProjectionToJobConf(realtimeSplit, jobConf); + LOG.info("Creating record reader with readCols :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) + + ", Ids :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR)); - private FileSplit doMakeSplitForPathWithLogFilePath(PathWithLogFilePath path, long start, long length, String[] hosts, String[] inMemoryHosts) { - if (!path.includeBootstrapFilePath()) { - return path.buildSplit(path, start, length, hosts); - } else { - FileSplit bf = - inMemoryHosts == null - ? super.makeSplit(path.getPathWithBootstrapFileStatus(), start, length, hosts) - : super.makeSplit(path.getPathWithBootstrapFileStatus(), start, length, hosts, inMemoryHosts); - return HoodieRealtimeInputFormatUtils - .createRealtimeBoostrapBaseFileSplit((BootstrapBaseFileSplit) bf, path.getBasePath(), path.getDeltaLogFiles(), path.getMaxCommitTime()); + // for log only split, set the parquet reader as empty. + if (FSUtils.isLogFile(realtimeSplit.getPath())) { + return new HoodieRealtimeRecordReader(realtimeSplit, jobConf, new HoodieEmptyRecordReader(realtimeSplit, jobConf)); } - } - @Override - protected HoodieDefaultTimeline filterInstantsTimeline(HoodieDefaultTimeline timeline) { - // no specific filtering for Realtime format - return timeline; + return new HoodieRealtimeRecordReader(realtimeSplit, jobConf, + super.getRecordReader(split, jobConf, reporter)); } void addProjectionToJobConf(final RealtimeSplit realtimeSplit, final JobConf jobConf) { @@ -287,32 +94,14 @@ void addProjectionToJobConf(final RealtimeSplit realtimeSplit, final JobConf job // TO fix this, hoodie columns are appended late at the time record-reader gets built instead of construction // time. 
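// --------------------------------------------------------------------------------------------
// Editorial sketch (not part of the patch): per the comment above, when a split carries delta
// log files the Hudi meta columns must end up in the projection even if the query never asked
// for them, because record keys drive the base-file/log merge. The helper below is a stand-in
// illustrating that idea, not the actual Hudi API; the "_hoodie_*" field names are Hudi's
// standard meta columns.
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

class ProjectionSketch {
  static List<String> withRequiredMetaColumns(List<String> requested) {
    List<String> result = new ArrayList<>(requested);
    for (String meta : Arrays.asList("_hoodie_record_key", "_hoodie_commit_time", "_hoodie_partition_path")) {
      if (!result.contains(meta)) {
        result.add(meta); // appended late, at record-reader construction time
      }
    }
    return result;
  }

  public static void main(String[] args) {
    // -> [rider, fare, _hoodie_record_key, _hoodie_commit_time, _hoodie_partition_path]
    System.out.println(withRequiredMetaColumns(Arrays.asList("rider", "fare")));
  }
}
// --------------------------------------------------------------------------------------------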
if (!realtimeSplit.getDeltaLogPaths().isEmpty()) { - HoodieRealtimeInputFormatUtils.addRequiredProjectionFields(jobConf, realtimeSplit.getHoodieVirtualKeyInfo()); + HoodieRealtimeInputFormatUtils.addRequiredProjectionFields(jobConf, realtimeSplit.getVirtualKeyInfo()); } - this.conf = jobConf; - this.conf.set(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP, "true"); + jobConf.set(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP, "true"); + setConf(jobConf); } } } - HoodieRealtimeInputFormatUtils.cleanProjectionColumnIds(jobConf); - } - @Override - public RecordReader getRecordReader(final InputSplit split, final JobConf jobConf, - final Reporter reporter) throws IOException { - // sanity check - ValidationUtils.checkArgument(split instanceof RealtimeSplit, - "HoodieRealtimeRecordReader can only work on RealtimeSplit and not with " + split); - RealtimeSplit realtimeSplit = (RealtimeSplit) split; - addProjectionToJobConf(realtimeSplit, jobConf); - LOG.info("Creating record reader with readCols :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) - + ", Ids :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR)); - - // for log only split, set the parquet reader as empty. - if (FSUtils.isLogFile(realtimeSplit.getPath())) { - return new HoodieRealtimeRecordReader(realtimeSplit, jobConf, new HoodieEmptyRecordReader(realtimeSplit, jobConf)); - } - return new HoodieRealtimeRecordReader(realtimeSplit, jobConf, - super.getRecordReader(split, jobConf, reporter)); + HoodieRealtimeInputFormatUtils.cleanProjectionColumnIds(jobConf); } } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeBootstrapBaseFileSplit.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeBootstrapBaseFileSplit.java new file mode 100644 index 0000000000000..c7022c98ad3cd --- /dev/null +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeBootstrapBaseFileSplit.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.hadoop.realtime; + +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.hadoop.BootstrapBaseFileSplit; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * Realtime {@link FileSplit} with external base file + * + * NOTE: If you're adding fields here you need to make sure that you appropriately de-/serialize them + * in {@link #readFromInput(DataInput)} and {@link #writeToOutput(DataOutput)} + */ +public class HoodieRealtimeBootstrapBaseFileSplit extends BootstrapBaseFileSplit implements RealtimeSplit { + /** + * Marks whether this path produced as part of Incremental Query + */ + private boolean belongsToIncrementalQuery = false; + /** + * List of delta log-files holding updated records for this base-file + */ + private List deltaLogFiles = new ArrayList<>(); + /** + * Latest commit instant available at the time of the query in which all of the files + * pertaining to this split are represented + */ + private String maxCommitTime; + /** + * Base path of the table this path belongs to + */ + private String basePath; + /** + * Virtual key configuration of the table this split belongs to + */ + private Option virtualKeyInfo = Option.empty(); + + /** + * NOTE: This ctor is necessary for Hive to be able to serialize and + * then instantiate it when deserializing back + */ + public HoodieRealtimeBootstrapBaseFileSplit() {} + + public HoodieRealtimeBootstrapBaseFileSplit(FileSplit baseSplit, + String basePath, + List deltaLogFiles, + String maxInstantTime, + FileSplit externalFileSplit, + boolean belongsToIncrementalQuery, + Option virtualKeyInfoOpt) throws IOException { + super(baseSplit, externalFileSplit); + this.maxCommitTime = maxInstantTime; + this.deltaLogFiles = deltaLogFiles; + this.basePath = basePath; + this.belongsToIncrementalQuery = belongsToIncrementalQuery; + this.virtualKeyInfo = virtualKeyInfoOpt; + } + + @Override + public void write(DataOutput out) throws IOException { + super.write(out); + writeToOutput(out); + } + + @Override + public void readFields(DataInput in) throws IOException { + super.readFields(in); + readFromInput(in); + } + + @Override + public List getDeltaLogFiles() { + return deltaLogFiles; + } + + @Override + public void setDeltaLogFiles(List deltaLogFiles) { + this.deltaLogFiles = deltaLogFiles; + } + + @Override + public String getMaxCommitTime() { + return maxCommitTime; + } + + @Override + public String getBasePath() { + return basePath; + } + + @Override + public Option getVirtualKeyInfo() { + return virtualKeyInfo; + } + + @Override + public boolean getBelongsToIncrementalQuery() { + return belongsToIncrementalQuery; + } + + @Override + public void setBelongsToIncrementalQuery(boolean belongsToIncrementalPath) { + this.belongsToIncrementalQuery = belongsToIncrementalPath; + } + + @Override + public void setMaxCommitTime(String maxInstantTime) { + this.maxCommitTime = maxInstantTime; + } + + @Override + public void setBasePath(String basePath) { + this.basePath = basePath; + } + + @Override + public void setVirtualKeyInfo(Option virtualKeyInfo) { + this.virtualKeyInfo = virtualKeyInfo; + } +} diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeFileSplit.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeFileSplit.java index a39ec35507a77..a424f021c2d20 
100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeFileSplit.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimeFileSplit.java @@ -18,83 +18,125 @@ package org.apache.hudi.hadoop.realtime; +import org.apache.hadoop.mapred.FileSplit; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.util.Option; -import org.apache.hadoop.mapred.FileSplit; - import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.util.ArrayList; import java.util.List; -import java.util.stream.Collectors; /** - * Filesplit that wraps the base split and a list of log files to merge deltas from. + * {@link FileSplit} implementation that holds + *
+ * <ol> + *   <li>Split corresponding to the base file</li> + *   <li>List of {@link HoodieLogFile} that holds the delta to be merged (upon reading)</li> + * </ol>
    + * + * This split is correspondent to a single file-slice in the Hudi terminology. + * + * NOTE: If you're adding fields here you need to make sure that you appropriately de-/serialize them + * in {@link #readFromInput(DataInput)} and {@link #writeToOutput(DataOutput)} */ public class HoodieRealtimeFileSplit extends FileSplit implements RealtimeSplit { - - private List deltaLogPaths; + /** + * List of delta log-files holding updated records for this base-file + */ private List deltaLogFiles = new ArrayList<>(); - - private String maxCommitTime; - + /** + * Base path of the table this path belongs to + */ private String basePath; - - private Option hoodieVirtualKeyInfo = Option.empty(); - - public HoodieRealtimeFileSplit() { - super(); + /** + * Latest commit instant available at the time of the query in which all of the files + * pertaining to this split are represented + */ + private String maxCommitTime; + /** + * Marks whether this path produced as part of Incremental Query + */ + private boolean belongsToIncrementalQuery = false; + /** + * Virtual key configuration of the table this split belongs to + */ + private Option virtualKeyInfo = Option.empty(); + + public HoodieRealtimeFileSplit() {} + + public HoodieRealtimeFileSplit(FileSplit baseSplit, + HoodieRealtimePath path) + throws IOException { + this(baseSplit, + path.getBasePath(), + path.getDeltaLogFiles(), + path.getMaxCommitTime(), + path.getBelongsToIncrementalQuery(), + path.getVirtualKeyInfo()); } - public HoodieRealtimeFileSplit(FileSplit baseSplit, String basePath, List deltaLogFiles, String maxCommitTime, - Option hoodieVirtualKeyInfo) + /** + * @VisibleInTesting + */ + public HoodieRealtimeFileSplit(FileSplit baseSplit, + String basePath, + List deltaLogFiles, + String maxCommitTime, + boolean belongsToIncrementalQuery, + Option virtualKeyInfo) throws IOException { super(baseSplit.getPath(), baseSplit.getStart(), baseSplit.getLength(), baseSplit.getLocations()); this.deltaLogFiles = deltaLogFiles; - this.deltaLogPaths = deltaLogFiles.stream().map(entry -> entry.getPath().toString()).collect(Collectors.toList()); - this.maxCommitTime = maxCommitTime; this.basePath = basePath; - this.hoodieVirtualKeyInfo = hoodieVirtualKeyInfo; - } - - public List getDeltaLogPaths() { - return deltaLogPaths; + this.maxCommitTime = maxCommitTime; + this.belongsToIncrementalQuery = belongsToIncrementalQuery; + this.virtualKeyInfo = virtualKeyInfo; } public List getDeltaLogFiles() { return deltaLogFiles; } + @Override + public void setDeltaLogFiles(List deltaLogFiles) { + this.deltaLogFiles = deltaLogFiles; + } + public String getMaxCommitTime() { return maxCommitTime; } + public void setMaxCommitTime(String maxCommitTime) { + this.maxCommitTime = maxCommitTime; + } + public String getBasePath() { return basePath; } - @Override - public void setHoodieVirtualKeyInfo(Option hoodieVirtualKeyInfo) { - this.hoodieVirtualKeyInfo = hoodieVirtualKeyInfo; + public void setBasePath(String basePath) { + this.basePath = basePath; } @Override - public Option getHoodieVirtualKeyInfo() { - return hoodieVirtualKeyInfo; + public void setVirtualKeyInfo(Option virtualKeyInfo) { + this.virtualKeyInfo = virtualKeyInfo; } - public void setDeltaLogPaths(List deltaLogPaths) { - this.deltaLogPaths = deltaLogPaths; + @Override + public Option getVirtualKeyInfo() { + return virtualKeyInfo; } - public void setMaxCommitTime(String maxCommitTime) { - this.maxCommitTime = maxCommitTime; + @Override + public boolean getBelongsToIncrementalQuery() { + return 
belongsToIncrementalQuery; } - public void setBasePath(String basePath) { - this.basePath = basePath; + @Override + public void setBelongsToIncrementalQuery(boolean belongsToIncrementalPath) { + this.belongsToIncrementalQuery = belongsToIncrementalPath; } @Override @@ -111,7 +153,7 @@ public void readFields(DataInput in) throws IOException { @Override public String toString() { - return "HoodieRealtimeFileSplit{DataPath=" + getPath() + ", deltaLogPaths=" + deltaLogPaths + return "HoodieRealtimeFileSplit{DataPath=" + getPath() + ", deltaLogPaths=" + getDeltaLogPaths() + ", maxCommitTime='" + maxCommitTime + '\'' + ", basePath='" + basePath + '\'' + '}'; } } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimePath.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimePath.java new file mode 100644 index 0000000000000..bba44d5c6632c --- /dev/null +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimePath.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.hadoop.realtime; + +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.hadoop.PathWithBootstrapFileStatus; + +import java.util.List; + +/** + * {@link Path} implementation encoding additional information necessary to appropriately read + * base files of the MOR tables, such as list of delta log files (holding updated records) associated + * w/ the base file, etc. 
+ */ +public class HoodieRealtimePath extends Path { + /** + * Marks whether this path produced as part of Incremental Query + */ + private final boolean belongsToIncrementalQuery; + /** + * List of delta log-files holding updated records for this base-file + */ + private final List deltaLogFiles; + /** + * Latest commit instant available at the time of the query in which all of the files + * pertaining to this split are represented + */ + private final String maxCommitTime; + /** + * Base path of the table this path belongs to + */ + private final String basePath; + /** + * Virtual key configuration of the table this split belongs to + */ + private final Option virtualKeyInfo; + /** + * File status for the Bootstrap file (only relevant if this table is a bootstrapped table + */ + private PathWithBootstrapFileStatus pathWithBootstrapFileStatus; + + public HoodieRealtimePath(Path parent, + String child, + String basePath, + List deltaLogFiles, + String maxCommitTime, + boolean belongsToIncrementalQuery, + Option virtualKeyInfo) { + super(parent, child); + this.basePath = basePath; + this.deltaLogFiles = deltaLogFiles; + this.maxCommitTime = maxCommitTime; + this.belongsToIncrementalQuery = belongsToIncrementalQuery; + this.virtualKeyInfo = virtualKeyInfo; + } + + public List getDeltaLogFiles() { + return deltaLogFiles; + } + + public String getMaxCommitTime() { + return maxCommitTime; + } + + public String getBasePath() { + return basePath; + } + + public boolean getBelongsToIncrementalQuery() { + return belongsToIncrementalQuery; + } + + public boolean isSplitable() { + return !toString().isEmpty(); + } + + public PathWithBootstrapFileStatus getPathWithBootstrapFileStatus() { + return pathWithBootstrapFileStatus; + } + + public void setPathWithBootstrapFileStatus(PathWithBootstrapFileStatus pathWithBootstrapFileStatus) { + this.pathWithBootstrapFileStatus = pathWithBootstrapFileStatus; + } + + public boolean includeBootstrapFilePath() { + return pathWithBootstrapFileStatus != null; + } + + public Option getVirtualKeyInfo() { + return virtualKeyInfo; + } +} diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeBootstrapBaseFileSplit.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeBootstrapBaseFileSplit.java deleted file mode 100644 index 79d2d815ee809..0000000000000 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeBootstrapBaseFileSplit.java +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hudi.hadoop.realtime; - -import org.apache.hudi.common.model.HoodieLogFile; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.hadoop.BootstrapBaseFileSplit; - -import org.apache.hadoop.mapred.FileSplit; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.stream.Collectors; - -/** - * Realtime File Split with external base file. - */ -public class RealtimeBootstrapBaseFileSplit extends BootstrapBaseFileSplit implements RealtimeSplit { - - private List deltaLogPaths; - private List deltaLogFiles = new ArrayList<>(); - - private String maxInstantTime; - - private String basePath; - - public RealtimeBootstrapBaseFileSplit() { - super(); - } - - public RealtimeBootstrapBaseFileSplit(FileSplit baseSplit, String basePath, List deltaLogFiles, - String maxInstantTime, FileSplit externalFileSplit) throws IOException { - super(baseSplit, externalFileSplit); - this.maxInstantTime = maxInstantTime; - this.deltaLogFiles = deltaLogFiles; - this.deltaLogPaths = deltaLogFiles.stream().map(entry -> entry.getPath().toString()).collect(Collectors.toList()); - this.basePath = basePath; - } - - @Override - public void write(DataOutput out) throws IOException { - super.write(out); - writeToOutput(out); - } - - @Override - public void readFields(DataInput in) throws IOException { - super.readFields(in); - readFromInput(in); - } - - @Override - public List getDeltaLogPaths() { - return deltaLogPaths; - } - - @Override - public List getDeltaLogFiles() { - return deltaLogFiles; - } - - @Override - public String getMaxCommitTime() { - return maxInstantTime; - } - - @Override - public String getBasePath() { - return basePath; - } - - @Override - public Option getHoodieVirtualKeyInfo() { - return Option.empty(); - } - - @Override - public void setDeltaLogPaths(List deltaLogPaths) { - this.deltaLogPaths = deltaLogPaths; - } - - @Override - public void setMaxCommitTime(String maxInstantTime) { - this.maxInstantTime = maxInstantTime; - } - - @Override - public void setBasePath(String basePath) { - this.basePath = basePath; - } - - @Override - public void setHoodieVirtualKeyInfo(Option hoodieVirtualKeyInfo) {} - -} diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeCompactedRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeCompactedRecordReader.java index f00efa5efaaa6..b917f004bcd06 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeCompactedRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeCompactedRecordReader.java @@ -18,9 +18,16 @@ package org.apache.hudi.hadoop.realtime; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.config.HoodieCommonConfig; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner; @@ -28,13 +35,6 @@ import org.apache.hudi.hadoop.config.HoodieRealtimeConfig; import 
org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils; - -import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.io.ArrayWritable; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.RecordReader; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -54,7 +54,7 @@ class RealtimeCompactedRecordReader extends AbstractRealtimeRecordReader private final Set deltaRecordKeys; private final HoodieMergedLogRecordScanner mergedLogRecordScanner; - private int recordKeyIndex = HoodieInputFormatUtils.HOODIE_RECORD_KEY_COL_POS; + private final int recordKeyIndex; private Iterator deltaItr; public RealtimeCompactedRecordReader(RealtimeSplit split, JobConf job, @@ -64,9 +64,9 @@ public RealtimeCompactedRecordReader(RealtimeSplit split, JobConf job, this.mergedLogRecordScanner = getMergedLogRecordScanner(); this.deltaRecordMap = mergedLogRecordScanner.getRecords(); this.deltaRecordKeys = new HashSet<>(this.deltaRecordMap.keySet()); - if (split.getHoodieVirtualKeyInfo().isPresent()) { - this.recordKeyIndex = split.getHoodieVirtualKeyInfo().get().getRecordKeyFieldIndex(); - } + this.recordKeyIndex = split.getVirtualKeyInfo() + .map(HoodieVirtualKeyInfo::getRecordKeyFieldIndex) + .orElse(HoodieInputFormatUtils.HOODIE_RECORD_KEY_COL_POS); } /** @@ -96,9 +96,9 @@ private HoodieMergedLogRecordScanner getMergedLogRecordScanner() throws IOExcept private Option buildGenericRecordwithCustomPayload(HoodieRecord record) throws IOException { if (usesCustomPayload) { - return record.getData().getInsertValue(getWriterSchema()); + return ((HoodieAvroRecord) record).getData().getInsertValue(getWriterSchema(), payloadProps); } else { - return record.getData().getInsertValue(getReaderSchema()); + return ((HoodieAvroRecord) record).getData().getInsertValue(getReaderSchema(), payloadProps); } } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeSplit.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeSplit.java index a7f0d2cc2f5e7..d9b1923c60f80 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeSplit.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeSplit.java @@ -18,18 +18,18 @@ package org.apache.hudi.hadoop.realtime; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.InputSplitWithLocationInfo; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.util.Option; import org.apache.hudi.hadoop.InputSplitUtils; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.mapred.InputSplitWithLocationInfo; - import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.stream.Collectors; /** * Realtime Input Split Interface. @@ -40,10 +40,14 @@ public interface RealtimeSplit extends InputSplitWithLocationInfo { * Return Log File Paths. * @return */ - List getDeltaLogPaths(); + default List getDeltaLogPaths() { + return getDeltaLogFiles().stream().map(entry -> entry.getPath().toString()).collect(Collectors.toList()); + } List getDeltaLogFiles(); + void setDeltaLogFiles(List deltaLogFiles); + /** * Return Max Instant Time. * @return @@ -60,14 +64,12 @@ public interface RealtimeSplit extends InputSplitWithLocationInfo { * Returns Virtual key info if meta fields are disabled. 
* @return */ - Option getHoodieVirtualKeyInfo(); + Option getVirtualKeyInfo(); /** - * Update Log File Paths. - * - * @param deltaLogPaths + * Returns the flag whether this split belongs to an Incremental Query */ - void setDeltaLogPaths(List deltaLogPaths); + boolean getBelongsToIncrementalQuery(); /** * Update Maximum valid instant time. @@ -81,57 +83,72 @@ public interface RealtimeSplit extends InputSplitWithLocationInfo { */ void setBasePath(String basePath); - void setHoodieVirtualKeyInfo(Option hoodieVirtualKeyInfo); + /** + * Sets the flag whether this split belongs to an Incremental Query + */ + void setBelongsToIncrementalQuery(boolean belongsToIncrementalQuery); + + void setVirtualKeyInfo(Option virtualKeyInfo); default void writeToOutput(DataOutput out) throws IOException { InputSplitUtils.writeString(getBasePath(), out); InputSplitUtils.writeString(getMaxCommitTime(), out); - out.writeInt(getDeltaLogPaths().size()); - for (String logFilePath : getDeltaLogPaths()) { - InputSplitUtils.writeString(logFilePath, out); + InputSplitUtils.writeBoolean(getBelongsToIncrementalQuery(), out); + + out.writeInt(getDeltaLogFiles().size()); + for (HoodieLogFile logFile : getDeltaLogFiles()) { + InputSplitUtils.writeString(logFile.getPath().toString(), out); + out.writeLong(logFile.getFileSize()); } - if (!getHoodieVirtualKeyInfo().isPresent()) { + + Option virtualKeyInfoOpt = getVirtualKeyInfo(); + if (!virtualKeyInfoOpt.isPresent()) { InputSplitUtils.writeBoolean(false, out); } else { InputSplitUtils.writeBoolean(true, out); - InputSplitUtils.writeString(getHoodieVirtualKeyInfo().get().getRecordKeyField(), out); - InputSplitUtils.writeString(getHoodieVirtualKeyInfo().get().getPartitionPathField(), out); - InputSplitUtils.writeString(String.valueOf(getHoodieVirtualKeyInfo().get().getRecordKeyFieldIndex()), out); - InputSplitUtils.writeString(String.valueOf(getHoodieVirtualKeyInfo().get().getPartitionPathFieldIndex()), out); + InputSplitUtils.writeString(virtualKeyInfoOpt.get().getRecordKeyField(), out); + InputSplitUtils.writeString(virtualKeyInfoOpt.get().getPartitionPathField(), out); + InputSplitUtils.writeString(String.valueOf(virtualKeyInfoOpt.get().getRecordKeyFieldIndex()), out); + InputSplitUtils.writeString(String.valueOf(virtualKeyInfoOpt.get().getPartitionPathFieldIndex()), out); } } default void readFromInput(DataInput in) throws IOException { setBasePath(InputSplitUtils.readString(in)); setMaxCommitTime(InputSplitUtils.readString(in)); + setBelongsToIncrementalQuery(InputSplitUtils.readBoolean(in)); + int totalLogFiles = in.readInt(); - List deltaLogPaths = new ArrayList<>(totalLogFiles); + List deltaLogPaths = new ArrayList<>(totalLogFiles); for (int i = 0; i < totalLogFiles; i++) { - deltaLogPaths.add(InputSplitUtils.readString(in)); + String logFilePath = InputSplitUtils.readString(in); + long logFileSize = in.readLong(); + deltaLogPaths.add(new HoodieLogFile(new Path(logFilePath), logFileSize)); } - setDeltaLogPaths(deltaLogPaths); + setDeltaLogFiles(deltaLogPaths); + boolean hoodieVirtualKeyPresent = InputSplitUtils.readBoolean(in); if (hoodieVirtualKeyPresent) { String recordKeyField = InputSplitUtils.readString(in); String partitionPathField = InputSplitUtils.readString(in); int recordFieldIndex = Integer.parseInt(InputSplitUtils.readString(in)); int partitionPathIndex = Integer.parseInt(InputSplitUtils.readString(in)); - setHoodieVirtualKeyInfo(Option.of(new HoodieVirtualKeyInfo(recordKeyField, partitionPathField, recordFieldIndex, partitionPathIndex))); + 
setVirtualKeyInfo(Option.of(new HoodieVirtualKeyInfo(recordKeyField, partitionPathField, recordFieldIndex, partitionPathIndex))); } } /** * The file containing this split's data. */ - public Path getPath(); + Path getPath(); /** * The position of the first byte in the file to process. */ - public long getStart(); + long getStart(); /** * The number of bytes in the file to process. */ - public long getLength(); + long getLength(); } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeUnmergedRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeUnmergedRecordReader.java index 9f51e7f16137e..84c808865072a 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeUnmergedRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/RealtimeUnmergedRecordReader.java @@ -18,18 +18,10 @@ package org.apache.hudi.hadoop.realtime; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.io.ArrayWritable; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.RecordReader; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.table.log.HoodieUnMergedLogRecordScanner; import org.apache.hudi.common.util.DefaultSizeEstimator; +import org.apache.hudi.common.util.Functions; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor; import org.apache.hudi.common.util.queue.BoundedInMemoryQueueProducer; @@ -40,6 +32,18 @@ import org.apache.hudi.hadoop.config.HoodieRealtimeConfig; import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.function.Function; + class RealtimeUnmergedRecordReader extends AbstractRealtimeRecordReader implements RecordReader { @@ -74,7 +78,7 @@ public RealtimeUnmergedRecordReader(RealtimeSplit split, JobConf job, this.parquetRecordsIterator = new RecordReaderValueIterator<>(this.parquetReader); this.executor = new BoundedInMemoryExecutor<>( HoodieRealtimeRecordReaderUtils.getMaxCompactionMemoryInBytes(jobConf), getParallelProducers(), - Option.empty(), x -> x, new DefaultSizeEstimator<>()); + Option.empty(), Function.identity(), new DefaultSizeEstimator<>(), Functions.noop()); // Consumer of this record reader this.iterator = this.executor.getQueue().iterator(); this.logRecordScanner = HoodieUnMergedLogRecordScanner.newBuilder() @@ -88,7 +92,7 @@ public RealtimeUnmergedRecordReader(RealtimeSplit split, JobConf job, .withBufferSize(this.jobConf.getInt(HoodieRealtimeConfig.MAX_DFS_STREAM_BUFFER_SIZE_PROP, HoodieRealtimeConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE)) .withLogRecordScannerCallback(record -> { // convert Hoodie log record to Hadoop AvroWritable and buffer - GenericRecord rec = (GenericRecord) record.getData().getInsertValue(getReaderSchema()).get(); + GenericRecord rec = (GenericRecord) record.getData().getInsertValue(getReaderSchema(), payloadProps).get(); ArrayWritable aWritable = (ArrayWritable) 
HoodieRealtimeRecordReaderUtils.avroToArrayWritable(rec, getHiveSchema()); this.executor.getQueue().insertRecord(aWritable); }) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieHiveUtils.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieHiveUtils.java index b4f7e336335d4..fa2bce4875379 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieHiveUtils.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieHiveUtils.java @@ -19,15 +19,11 @@ package org.apache.hudi.hadoop.utils; import org.apache.hadoop.conf.Configuration; -import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.util.CollectionUtils; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.exception.HoodieIOException; - import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.Option; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -148,39 +144,6 @@ public static List getIncrementalTableNames(JobContext job) { return result; } - /** - * Depending on the configs hoodie.%s.consume.pending.commits and hoodie.%s.consume.commit of job - * - * (hoodie..consume.pending.commits, hoodie..consume.commit) -> - * (true, validCommit) -> returns activeTimeline filtered until validCommit - * (true, InValidCommit) -> Raises HoodieIOException - * (true, notSet) -> Raises HoodieIOException - * (false, validCommit) -> returns completedTimeline filtered until validCommit - * (false, InValidCommit) -> Raises HoodieIOException - * (false or notSet, notSet) -> returns completedTimeline unfiltered - * - * validCommit is one which exists in the timeline being checked and vice versa - */ - public static HoodieTimeline getTableTimeline(final String tableName, final JobConf job, final HoodieTableMetaClient metaClient) { - HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline(); - - boolean includePendingCommits = shouldIncludePendingCommits(job, tableName); - Option maxCommit = getMaxCommit(job, tableName); - - HoodieTimeline finalizedTimeline = includePendingCommits ? timeline : timeline.filterCompletedInstants(); - - return !maxCommit.isPresent() ? 
finalizedTimeline : filterIfInstantExists(tableName, finalizedTimeline, maxCommit.get()); - - } - - private static HoodieTimeline filterIfInstantExists(String tableName, HoodieTimeline timeline, String maxCommit) { - if (maxCommit == null || !timeline.containsInstant(maxCommit)) { - LOG.info("Timestamp " + maxCommit + " doesn't exist in the commits timeline:" + timeline + " table: " + tableName); - throw new HoodieIOException("Valid timestamp is required for " + HOODIE_CONSUME_COMMIT + " in snapshot mode"); - } - return timeline.findInstantsBeforeOrEquals(maxCommit); - } - public static boolean isIncrementalUseDatabase(Configuration conf) { return conf.getBoolean(HOODIE_INCREMENTAL_USE_DATABASE, false); } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieInputFormatUtils.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieInputFormatUtils.java index a5a3f7e215073..7fec1fb63f6fa 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieInputFormatUtils.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieInputFormatUtils.java @@ -18,47 +18,40 @@ package org.apache.hudi.hadoop.utils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; +import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat; +import org.apache.hadoop.hive.ql.io.orc.OrcSerde; +import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat; +import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.JobContext; import org.apache.hudi.common.config.HoodieMetadataConfig; -import org.apache.hudi.common.engine.HoodieLocalEngineContext; import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodiePartitionMetadata; -import org.apache.hudi.common.model.HoodieBaseFile; -import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.table.view.FileSystemViewManager; import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.common.table.view.TableFileSystemView; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.hadoop.FileStatusWithBootstrapBaseFile; import org.apache.hudi.hadoop.HoodieHFileInputFormat; import org.apache.hudi.hadoop.HoodieParquetInputFormat; -import org.apache.hudi.hadoop.RealtimeFileStatus; import org.apache.hudi.hadoop.LocatedFileStatusWithBootstrapBaseFile; -import org.apache.hudi.hadoop.FileStatusWithBootstrapBaseFile; import org.apache.hudi.hadoop.realtime.HoodieHFileRealtimeInputFormat; import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat; - -import org.apache.hadoop.conf.Configuration; -import 
org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.LocatedFileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; -import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat; -import org.apache.hadoop.hive.ql.io.orc.OrcSerde; -import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat; -import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; -import org.apache.hadoop.mapred.FileInputFormat; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapreduce.Job; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -318,7 +311,7 @@ public static Map getTableMetaClientByPartitionPath Map metaClientMap = new HashMap<>(); return partitions.stream().collect(Collectors.toMap(Function.identity(), p -> { try { - HoodieTableMetaClient metaClient = getTableMetaClientForBasePath(p.getFileSystem(conf), p); + HoodieTableMetaClient metaClient = getTableMetaClientForBasePathUnchecked(conf, p); metaClientMap.put(p, metaClient); return metaClient; } catch (IOException e) { @@ -328,20 +321,17 @@ public static Map getTableMetaClientByPartitionPath } /** - * Extract HoodieTableMetaClient from a partition path(not base path). - * @param fs - * @param dataPath - * @return - * @throws IOException + * Extract HoodieTableMetaClient from a partition path (not base path) */ - public static HoodieTableMetaClient getTableMetaClientForBasePath(FileSystem fs, Path dataPath) throws IOException { + public static HoodieTableMetaClient getTableMetaClientForBasePathUnchecked(Configuration conf, Path partitionPath) throws IOException { + FileSystem fs = partitionPath.getFileSystem(conf); int levels = HoodieHiveUtils.DEFAULT_LEVELS_TO_BASEPATH; - if (HoodiePartitionMetadata.hasPartitionMetadata(fs, dataPath)) { - HoodiePartitionMetadata metadata = new HoodiePartitionMetadata(fs, dataPath); + if (HoodiePartitionMetadata.hasPartitionMetadata(fs, partitionPath)) { + HoodiePartitionMetadata metadata = new HoodiePartitionMetadata(fs, partitionPath); metadata.readFromFS(); levels = metadata.getPartitionDepth(); } - Path baseDir = HoodieHiveUtils.getNthParent(dataPath, levels); + Path baseDir = HoodieHiveUtils.getNthParent(partitionPath, levels); LOG.info("Reading hoodie metadata from path " + baseDir.toString()); return HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(baseDir.toString()).build(); } @@ -440,67 +430,6 @@ public static HoodieMetadataConfig buildMetadataConfig(Configuration conf) { .build(); } - public static List filterFileStatusForSnapshotMode(JobConf job, Map tableMetaClientMap, - List snapshotPaths, boolean includeLogFiles) throws IOException { - HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(job); - List returns = new ArrayList<>(); - - Map> groupedPaths = - HoodieInputFormatUtils.groupSnapshotPathsByMetaClient(tableMetaClientMap.values(), snapshotPaths); - - Map fsViewCache = new HashMap<>(); - - LOG.info("Found a total of " + groupedPaths.size() + " groups"); - - try { - for (Map.Entry> entry : groupedPaths.entrySet()) { - HoodieTableMetaClient metaClient = entry.getKey(); - if (LOG.isDebugEnabled()) { - LOG.debug("Hoodie Metadata initialized with completed commit instant as :" + metaClient); - } - - HoodieTimeline timeline = HoodieHiveUtils.getTableTimeline(metaClient.getTableConfig().getTableName(), job, metaClient); - - HoodieTableFileSystemView fsView = fsViewCache.computeIfAbsent(metaClient, tableMetaClient -> - 
FileSystemViewManager.createInMemoryFileSystemViewWithTimeline(engineContext, tableMetaClient, buildMetadataConfig(job), timeline)); - List filteredBaseFiles = new ArrayList<>(); - Map> filteredLogs = new HashMap<>(); - for (Path p : entry.getValue()) { - String relativePartitionPath = FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), p); - List matched = fsView.getLatestBaseFiles(relativePartitionPath).collect(Collectors.toList()); - filteredBaseFiles.addAll(matched); - if (includeLogFiles) { - List logMatched = fsView.getLatestFileSlices(relativePartitionPath) - .filter(f -> !f.getBaseFile().isPresent() && f.getLatestLogFile().isPresent()) - .collect(Collectors.toList()); - logMatched.forEach(f -> { - List logPathSizePairs = f.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList()); - filteredLogs.put(f.getLatestLogFile().get().getFileStatus(), logPathSizePairs); - }); - } - } - - LOG.info("Total paths to process after hoodie filter " + filteredBaseFiles.size()); - for (HoodieBaseFile filteredFile : filteredBaseFiles) { - if (LOG.isDebugEnabled()) { - LOG.debug("Processing latest hoodie file - " + filteredFile.getPath()); - } - filteredFile = refreshFileStatus(job, filteredFile); - returns.add(getFileStatus(filteredFile)); - } - - for (Map.Entry> filterLogEntry : filteredLogs.entrySet()) { - RealtimeFileStatus rs = new RealtimeFileStatus(filterLogEntry.getKey()); - rs.setDeltaLogFiles(filterLogEntry.getValue()); - returns.add(rs); - } - } - } finally { - fsViewCache.forEach(((metaClient, fsView) -> fsView.close())); - } - return returns; - } - /** * Checks the file status for a race condition which can set the file size to 0. 1. HiveInputFormat does * super.listStatus() and gets back a FileStatus[] 2. Then it creates the HoodieTableMetaClient for the paths listed. @@ -534,12 +463,12 @@ private static HoodieBaseFile refreshFileStatus(Configuration conf, HoodieBaseFi * * @return the affected file status array */ - public static FileStatus[] listAffectedFilesForCommits(Path basePath, List metadataList) { + public static FileStatus[] listAffectedFilesForCommits(Configuration hadoopConf, Path basePath, List metadataList) { // TODO: Use HoodieMetaTable to extract affected file directly. HashMap fullPathToFileStatus = new HashMap<>(); // Iterate through the given commits. 
for (HoodieCommitMetadata metadata: metadataList) { - fullPathToFileStatus.putAll(metadata.getFullPathToFileStatus(basePath.toString())); + fullPathToFileStatus.putAll(metadata.getFullPathToFileStatus(hadoopConf, basePath.toString())); } return fullPathToFileStatus.values().toArray(new FileStatus[0]); } diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieRealtimeInputFormatUtils.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieRealtimeInputFormatUtils.java index 6718642d22728..d2501ee8dc15e 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieRealtimeInputFormatUtils.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieRealtimeInputFormatUtils.java @@ -18,46 +18,30 @@ package org.apache.hudi.hadoop.utils; -import org.apache.hudi.common.engine.HoodieLocalEngineContext; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.JobConf; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.FileSlice; import org.apache.hudi.common.model.HoodieBaseFile; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; -import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; -import org.apache.hudi.common.table.TableSchemaResolver; import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.table.timeline.HoodieTimeline; -import org.apache.hudi.common.table.view.FileSystemViewManager; import org.apache.hudi.common.table.view.HoodieTableFileSystemView; -import org.apache.hudi.common.util.CollectionUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.exception.HoodieIOException; -import org.apache.hudi.hadoop.BaseFileWithLogsSplit; -import org.apache.hudi.hadoop.BootstrapBaseFileSplit; +import org.apache.hudi.hadoop.realtime.HoodieRealtimeBootstrapBaseFileSplit; import org.apache.hudi.hadoop.realtime.HoodieRealtimeFileSplit; import org.apache.hudi.hadoop.realtime.HoodieVirtualKeyInfo; -import org.apache.hudi.hadoop.realtime.RealtimeBootstrapBaseFileSplit; import org.apache.hudi.hadoop.realtime.RealtimeSplit; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; -import org.apache.hadoop.mapred.FileSplit; -import org.apache.hadoop.mapred.InputSplit; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.SplitLocationInfo; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import org.apache.parquet.schema.MessageType; -import java.io.IOException; import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -65,169 +49,32 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import static org.apache.hudi.TypeUtils.unsafeCast; + public class HoodieRealtimeInputFormatUtils extends HoodieInputFormatUtils { private static final Logger LOG = LogManager.getLogger(HoodieRealtimeInputFormatUtils.class); - public static InputSplit[] getRealtimeSplits(Configuration conf, Stream fileSplits) { - Map> partitionsToParquetSplits = - 
fileSplits.collect(Collectors.groupingBy(split -> split.getPath().getParent())); - // TODO(vc): Should we handle also non-hoodie splits here? - Map partitionsToMetaClient = getTableMetaClientByPartitionPath(conf, partitionsToParquetSplits.keySet()); - - // Create file system cache so metadata table is only instantiated once. Also can benefit normal file listing if - // partition path is listed twice so file groups will already be loaded in file system - Map fsCache = new HashMap<>(); - // for all unique split parents, obtain all delta files based on delta commit timeline, - // grouped on file id - List rtSplits = new ArrayList<>(); - try { - // Pre process tableConfig from first partition to fetch virtual key info - Option hoodieVirtualKeyInfo = Option.empty(); - if (partitionsToParquetSplits.size() > 0) { - HoodieTableMetaClient metaClient = partitionsToMetaClient.get(partitionsToParquetSplits.keySet().iterator().next()); - hoodieVirtualKeyInfo = getHoodieVirtualKeyInfo(metaClient); - } - Option finalHoodieVirtualKeyInfo = hoodieVirtualKeyInfo; - partitionsToParquetSplits.keySet().forEach(partitionPath -> { - // for each partition path obtain the data & log file groupings, then map back to inputsplits - HoodieTableMetaClient metaClient = partitionsToMetaClient.get(partitionPath); - if (!fsCache.containsKey(metaClient)) { - HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(conf); - HoodieTableFileSystemView fsView = FileSystemViewManager.createInMemoryFileSystemViewWithTimeline(engineContext, - metaClient, HoodieInputFormatUtils.buildMetadataConfig(conf), metaClient.getActiveTimeline()); - fsCache.put(metaClient, fsView); - } - HoodieTableFileSystemView fsView = fsCache.get(metaClient); - - String relPartitionPath = FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), partitionPath); - // Both commit and delta-commits are included - pick the latest completed one - Option latestCompletedInstant = - metaClient.getActiveTimeline().getWriteTimeline().filterCompletedInstants().lastInstant(); - - Stream latestFileSlices = latestCompletedInstant - .map(instant -> fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, instant.getTimestamp())) - .orElse(Stream.empty()); - - // subgroup splits again by file id & match with log files. 
- Map> groupedInputSplits = partitionsToParquetSplits.get(partitionPath).stream() - .collect(Collectors.groupingBy(split -> FSUtils.getFileIdFromFilePath(split.getPath()))); - // Get the maxCommit from the last delta or compaction or commit - when bootstrapped from COW table - String maxCommitTime = metaClient.getActiveTimeline().getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.COMMIT_ACTION, - HoodieTimeline.ROLLBACK_ACTION, HoodieTimeline.DELTA_COMMIT_ACTION, HoodieTimeline.REPLACE_COMMIT_ACTION)) - .filterCompletedInstants().lastInstant().get().getTimestamp(); - latestFileSlices.forEach(fileSlice -> { - List dataFileSplits = groupedInputSplits.getOrDefault(fileSlice.getFileId(), new ArrayList<>()); - dataFileSplits.forEach(split -> { - try { - List logFiles = fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()) - .collect(Collectors.toList()); - if (split instanceof BootstrapBaseFileSplit) { - BootstrapBaseFileSplit eSplit = (BootstrapBaseFileSplit) split; - rtSplits.add(createRealtimeBoostrapBaseFileSplit(eSplit, metaClient.getBasePath(), logFiles, maxCommitTime)); - } else { - rtSplits.add(new HoodieRealtimeFileSplit(split, metaClient.getBasePath(), logFiles, maxCommitTime, finalHoodieVirtualKeyInfo)); - } - } catch (IOException e) { - throw new HoodieIOException("Error creating hoodie real time split ", e); - } - }); - }); - }); - } catch (Exception e) { - throw new HoodieException("Error obtaining data file/log file grouping ", e); - } finally { - // close all the open fs views. - fsCache.forEach((k, view) -> view.close()); - } - LOG.info("Returning a total splits of " + rtSplits.size()); - return rtSplits.toArray(new InputSplit[0]); - } - - // get IncrementalRealtimeSplits - public static InputSplit[] getIncrementalRealtimeSplits(Configuration conf, Stream fileSplits) throws IOException { - List rtSplits = new ArrayList<>(); - List fileSplitList = fileSplits.collect(Collectors.toList()); - Set partitionSet = fileSplitList.stream().map(f -> f.getPath().getParent()).collect(Collectors.toSet()); - Map partitionsToMetaClient = getTableMetaClientByPartitionPath(conf, partitionSet); - // Pre process tableConfig from first partition to fetch virtual key info - Option hoodieVirtualKeyInfo = Option.empty(); - if (partitionSet.size() > 0) { - hoodieVirtualKeyInfo = getHoodieVirtualKeyInfo(partitionsToMetaClient.get(partitionSet.iterator().next())); - } - Option finalHoodieVirtualKeyInfo = hoodieVirtualKeyInfo; - fileSplitList.stream().forEach(s -> { - // deal with incremental query. 
- try { - if (s instanceof BaseFileWithLogsSplit) { - BaseFileWithLogsSplit bs = (BaseFileWithLogsSplit)s; - if (bs.getBelongToIncrementalSplit()) { - rtSplits.add(new HoodieRealtimeFileSplit(bs, bs.getBasePath(), bs.getDeltaLogFiles(), bs.getMaxCommitTime(), finalHoodieVirtualKeyInfo)); - } - } else if (s instanceof RealtimeBootstrapBaseFileSplit) { - rtSplits.add(s); - } - } catch (IOException e) { - throw new HoodieIOException("Error creating hoodie real time split ", e); - } - }); - LOG.info("Returning a total splits of " + rtSplits.size()); - return rtSplits.toArray(new InputSplit[0]); - } - - public static Option getHoodieVirtualKeyInfo(HoodieTableMetaClient metaClient) { - HoodieTableConfig tableConfig = metaClient.getTableConfig(); - if (!tableConfig.populateMetaFields()) { - TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(metaClient); - try { - MessageType parquetSchema = tableSchemaResolver.getTableParquetSchema(); - return Option.of(new HoodieVirtualKeyInfo(tableConfig.getRecordKeyFieldProp(), - tableConfig.getPartitionFieldProp(), parquetSchema.getFieldIndex(tableConfig.getRecordKeyFieldProp()), - parquetSchema.getFieldIndex(tableConfig.getPartitionFieldProp()))); - } catch (Exception exception) { - throw new HoodieException("Fetching table schema failed with exception ", exception); - } + public static boolean doesBelongToIncrementalQuery(FileSplit s) { + if (s instanceof HoodieRealtimeFileSplit) { + HoodieRealtimeFileSplit bs = unsafeCast(s); + return bs.getBelongsToIncrementalQuery(); + } else if (s instanceof HoodieRealtimeBootstrapBaseFileSplit) { + HoodieRealtimeBootstrapBaseFileSplit bs = unsafeCast(s); + return bs.getBelongsToIncrementalQuery(); } - return Option.empty(); - } - public static boolean isIncrementalQuerySplits(List fileSplits) { - if (fileSplits == null || fileSplits.size() == 0) { - return false; - } - return fileSplits.stream().anyMatch(s -> { - if (s instanceof BaseFileWithLogsSplit) { - BaseFileWithLogsSplit bs = (BaseFileWithLogsSplit)s; - return bs.getBelongToIncrementalSplit(); - } else { - return s instanceof RealtimeBootstrapBaseFileSplit; - } - }); - } - - public static RealtimeBootstrapBaseFileSplit createRealtimeBoostrapBaseFileSplit( - BootstrapBaseFileSplit split, String basePath, List logFiles, String maxInstantTime) { - try { - String[] hosts = split.getLocationInfo() != null ? Arrays.stream(split.getLocationInfo()) - .filter(x -> !x.isInMemory()).toArray(String[]::new) : new String[0]; - String[] inMemoryHosts = split.getLocationInfo() != null ? Arrays.stream(split.getLocationInfo()) - .filter(SplitLocationInfo::isInMemory).toArray(String[]::new) : new String[0]; - FileSplit baseSplit = new FileSplit(split.getPath(), split.getStart(), split.getLength(), - hosts, inMemoryHosts); - return new RealtimeBootstrapBaseFileSplit(baseSplit, basePath, logFiles, maxInstantTime, split.getBootstrapFileSplit()); - } catch (IOException e) { - throw new HoodieIOException("Error creating hoodie real time split ", e); - } + return false; } // Return parquet file with a list of log files in the same file group. - public static List, List>> groupLogsByBaseFile(Configuration conf, List partitionPaths) { + public static List, List>> groupLogsByBaseFile(Configuration conf, List partitionPaths) { Set partitionSet = new HashSet<>(partitionPaths); // TODO(vc): Should we handle also non-hoodie splits here? 
Map partitionsToMetaClient = getTableMetaClientByPartitionPath(conf, partitionSet); // Get all the base file and its log file pairs in required partition paths. - List, List>> baseAndLogsList = new ArrayList<>(); + List, List>> baseAndLogsList = new ArrayList<>(); partitionSet.forEach(partitionPath -> { // for each partition path obtain the data & log file groupings, then map back to inputsplits HoodieTableMetaClient metaClient = partitionsToMetaClient.get(partitionPath); @@ -244,8 +91,7 @@ public static List, List>> groupLogsByBaseFi .orElse(Stream.empty()); latestFileSlices.forEach(fileSlice -> { - List logFilePaths = fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()) - .map(logFile -> logFile.getPath().toString()).collect(Collectors.toList()); + List logFilePaths = fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList()); baseAndLogsList.add(Pair.of(fileSlice.getBaseFile(), logFilePaths)); }); } catch (Exception e) { @@ -312,7 +158,7 @@ public static boolean requiredProjectionFieldsExistInConf(Configuration configur public static boolean canAddProjectionToJobConf(final RealtimeSplit realtimeSplit, final JobConf jobConf) { return jobConf.get(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP) == null - || (!realtimeSplit.getDeltaLogPaths().isEmpty() && !HoodieRealtimeInputFormatUtils.requiredProjectionFieldsExistInConf(jobConf, realtimeSplit.getHoodieVirtualKeyInfo())); + || (!realtimeSplit.getDeltaLogPaths().isEmpty() && !HoodieRealtimeInputFormatUtils.requiredProjectionFieldsExistInConf(jobConf, realtimeSplit.getVirtualKeyInfo())); } /** diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieParquetInputFormat.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieParquetInputFormat.java index 5c7a1fdf2f84f..2ae7c36d98e7e 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieParquetInputFormat.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieParquetInputFormat.java @@ -18,6 +18,15 @@ package org.apache.hudi.hadoop; +import org.apache.avro.Schema; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapreduce.Job; import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieCommitMetadata; @@ -34,16 +43,7 @@ import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.hadoop.testutils.InputFormatTestUtil; import org.apache.hudi.hadoop.utils.HoodieHiveUtils; - -import org.apache.avro.Schema; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.io.ArrayWritable; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.mapred.FileInputFormat; -import org.apache.hadoop.mapred.InputSplit; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.RecordReader; -import org.apache.hadoop.mapreduce.Job; +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; @@ -115,7 +115,7 @@ public void testPendingCompactionWithActiveCommits() throws IOException { timeline.setInstants(instants); // Verify
getCommitsTimelineBeforePendingCompaction does not return instants after first compaction instant - HoodieTimeline filteredTimeline = inputFormat.filterInstantsTimeline(timeline); + HoodieTimeline filteredTimeline = HoodieInputFormatUtils.filterInstantsTimeline(timeline); assertTrue(filteredTimeline.containsInstant(t1)); assertTrue(filteredTimeline.containsInstant(t2)); assertFalse(filteredTimeline.containsInstant(t3)); @@ -126,7 +126,7 @@ public void testPendingCompactionWithActiveCommits() throws IOException { instants.remove(t3); timeline = new HoodieActiveTimeline(metaClient); timeline.setInstants(instants); - filteredTimeline = inputFormat.filterInstantsTimeline(timeline); + filteredTimeline = HoodieInputFormatUtils.filterInstantsTimeline(timeline); // verify all remaining instants are returned. assertTrue(filteredTimeline.containsInstant(t1)); @@ -140,7 +140,7 @@ public void testPendingCompactionWithActiveCommits() throws IOException { instants.remove(t5); timeline = new HoodieActiveTimeline(metaClient); timeline.setInstants(instants); - filteredTimeline = inputFormat.filterInstantsTimeline(timeline); + filteredTimeline = HoodieInputFormatUtils.filterInstantsTimeline(timeline); // verify all remaining instants are returned. assertTrue(filteredTimeline.containsInstant(t1)); @@ -202,11 +202,11 @@ public void testSnapshotWithInvalidCommitShouldThrowException() throws IOExcepti FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); InputFormatTestUtil.setupSnapshotIncludePendingCommits(jobConf, "1"); Exception exception = assertThrows(HoodieIOException.class, () -> inputFormat.listStatus(jobConf)); - assertEquals("Valid timestamp is required for hoodie.%s.consume.commit in snapshot mode", exception.getMessage()); + assertEquals("Query instant (1) not found in the timeline", exception.getMessage()); InputFormatTestUtil.setupSnapshotMaxCommitTimeQueryMode(jobConf, "1"); exception = assertThrows(HoodieIOException.class, () -> inputFormat.listStatus(jobConf)); - assertEquals("Valid timestamp is required for hoodie.%s.consume.commit in snapshot mode", exception.getMessage()); + assertEquals("Query instant (1) not found in the timeline", exception.getMessage()); } @Test diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/functional/TestHoodieCombineHiveInputFormat.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/functional/TestHoodieCombineHiveInputFormat.java index 50c3f2e1c4e88..ec6ea0a8b3ec3 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/functional/TestHoodieCombineHiveInputFormat.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/functional/TestHoodieCombineHiveInputFormat.java @@ -104,7 +104,7 @@ public void multiPartitionReadersRealtimeCombineHoodieInputFormat() throws Excep final int numRecords = 1000; // Create 3 partitions, each partition holds one parquet file and 1000 records List partitionDirs = InputFormatTestUtil - .prepareMultiPartitionedParquetTable(tempDir, schema, 3, numRecords, commitTime); + .prepareMultiPartitionedParquetTable(tempDir, schema, 3, numRecords, commitTime, HoodieTableType.MERGE_ON_READ); InputFormatTestUtil.commit(tempDir, commitTime); TableDesc tblDesc = Utilities.defaultTd; @@ -245,7 +245,7 @@ public void multiLevelPartitionReadersRealtimeCombineHoodieInputFormat() throws } @Test - public void testMutilReaderRealtimeComineHoodieInputFormat() throws Exception { + public void testMultiReaderRealtimeCombineHoodieInputFormat() throws Exception { // test for hudi-1722 Configuration conf = new 
Configuration(); // initial commit diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieRealtimeFileSplit.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieRealtimeFileSplit.java index 9d3855c47d663..a6ca32769cf8d 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieRealtimeFileSplit.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieRealtimeFileSplit.java @@ -18,12 +18,11 @@ package org.apache.hudi.hadoop.realtime; -import org.apache.hudi.common.model.HoodieLogFile; -import org.apache.hudi.common.util.Option; - import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.FileSplit; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.util.Option; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; @@ -72,7 +71,7 @@ public void setUp(@TempDir java.nio.file.Path tempDir) throws Exception { baseFileSplit = new FileSplit(new Path(fileSplitName), 0, 100, new String[] {}); maxCommitTime = "10001"; - split = new HoodieRealtimeFileSplit(baseFileSplit, basePath, deltaLogFiles, maxCommitTime, Option.empty()); + split = new HoodieRealtimeFileSplit(baseFileSplit, basePath, deltaLogFiles, maxCommitTime, false, Option.empty()); } @Test diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieRealtimeRecordReader.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieRealtimeRecordReader.java index ede76dc3490fa..fc4eb7ce2c042 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieRealtimeRecordReader.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieRealtimeRecordReader.java @@ -18,8 +18,28 @@ package org.apache.hudi.hadoop.realtime; +import org.apache.avro.Schema; +import org.apache.avro.Schema.Field; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; +import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat; +import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; +import org.apache.hadoop.io.ArrayWritable; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; import org.apache.hadoop.mapred.Reporter; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.model.HoodieCompactionPlan; @@ -30,8 +50,8 @@ import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; import org.apache.hudi.common.model.HoodieTableType; -import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.model.HoodieWriteStat; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.log.HoodieLogFormat; import org.apache.hudi.common.table.log.HoodieLogFormat.Writer; import 
org.apache.hudi.common.table.log.block.HoodieLogBlock; @@ -44,32 +64,9 @@ import org.apache.hudi.common.util.collection.ExternalSpillableMap; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieException; -import org.apache.hudi.hadoop.BaseFileWithLogsSplit; -import org.apache.hudi.hadoop.PathWithLogFilePath; import org.apache.hudi.hadoop.RealtimeFileStatus; -import org.apache.hudi.hadoop.testutils.InputFormatTestUtil; import org.apache.hudi.hadoop.config.HoodieRealtimeConfig; - -import org.apache.avro.Schema; -import org.apache.avro.Schema.Field; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; -import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat; -import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; -import org.apache.hadoop.io.ArrayWritable; -import org.apache.hadoop.io.BooleanWritable; -import org.apache.hadoop.io.DoubleWritable; -import org.apache.hadoop.io.FloatWritable; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapred.FileInputFormat; -import org.apache.hadoop.mapred.FileSplit; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.RecordReader; +import org.apache.hudi.hadoop.testutils.InputFormatTestUtil; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; @@ -84,12 +81,12 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Properties; import java.util.Set; -import java.util.Map; -import java.util.HashMap; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -163,6 +160,12 @@ public void testHFileInlineReader() throws Exception { HoodieLogBlock.HoodieLogBlockType.HFILE_DATA_BLOCK); } + @Test + public void testParquetInlineReader() throws Exception { + testReaderInternal(ExternalSpillableMap.DiskMapType.BITCASK, false, false, + HoodieLogBlock.HoodieLogBlockType.PARQUET_DATA_BLOCK); + } + private void testReaderInternal(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled, boolean partitioned) throws Exception { @@ -223,7 +226,9 @@ private void testReaderInternal(ExternalSpillableMap.DiskMapType diskMapType, new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + baseInstant + ".parquet"), 0, 1, baseJobConf), basePath.toUri().toString(), fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()) .collect(Collectors.toList()), - instantTime, Option.empty()); + instantTime, + false, + Option.empty()); // create a RecordReader to be used by HoodieRealtimeRecordReader RecordReader reader = new MapredParquetInputFormat().getRecordReader( @@ -303,7 +308,7 @@ public void testUnMergedReader() throws Exception { // create a split with baseFile (parquet file written earlier) and new log file(s) HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit( new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + instantTime + ".parquet"), 0, 1, baseJobConf), - basePath.toUri().toString(), Collections.singletonList(writer.getLogFile()), newCommitTime, Option.empty()); + basePath.toUri().toString(), Collections.singletonList(writer.getLogFile()), newCommitTime, 
false, Option.empty()); // create a RecordReader to be used by HoodieRealtimeRecordReader RecordReader reader = new MapredParquetInputFormat().getRecordReader( @@ -382,7 +387,7 @@ public void testReaderWithNestedAndComplexSchema(ExternalSpillableMap.DiskMapTyp // create a split with baseFile (parquet file written earlier) and new log file(s) HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit( new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + instantTime + ".parquet"), 0, 1, baseJobConf), - basePath.toUri().toString(), Collections.singletonList(writer.getLogFile()), newCommitTime, Option.empty()); + basePath.toUri().toString(), Collections.singletonList(writer.getLogFile()), newCommitTime, false, Option.empty()); // create a RecordReader to be used by HoodieRealtimeRecordReader RecordReader reader = new MapredParquetInputFormat().getRecordReader( @@ -529,7 +534,7 @@ public void testSchemaEvolutionAndRollbackBlockInLastLogFile(ExternalSpillableMa // create a split with baseFile (parquet file written earlier) and new log file(s) HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit( new FileSplit(new Path(partitionDir + "/fileid0_1_" + instantTime + ".parquet"), 0, 1, baseJobConf), - basePath.toUri().toString(), logFiles, newCommitTime, Option.empty()); + basePath.toUri().toString(), logFiles, newCommitTime, false, Option.empty()); // create a RecordReader to be used by HoodieRealtimeRecordReader RecordReader reader = new MapredParquetInputFormat().getRecordReader( @@ -609,7 +614,7 @@ public void testIncrementalWithOnlylog() throws Exception { HoodieParquetRealtimeInputFormat inputFormat = new HoodieParquetRealtimeInputFormat(); inputFormat.setConf(baseJobConf); InputSplit[] splits = inputFormat.getSplits(baseJobConf, 1); - assertTrue(splits.length == 1); + assertEquals(1, splits.length); JobConf newJobConf = new JobConf(baseJobConf); List fields = schema.getFields(); setHiveColumnNameProps(fields, newJobConf, false); @@ -763,13 +768,16 @@ public void testLogOnlyReader() throws Exception { FileCreateUtils.createDeltaCommit(basePath.toString(), instantTime); // create a split with new log file(s) fileSlice.addLogFile(new HoodieLogFile(writer.getLogFile().getPath(), size)); - RealtimeFileStatus realtimeFileStatus = new RealtimeFileStatus(new FileStatus(writer.getLogFile().getFileSize(), false, 1, 1, 0, writer.getLogFile().getPath())); + RealtimeFileStatus realtimeFileStatus = new RealtimeFileStatus( + new FileStatus(writer.getLogFile().getFileSize(), false, 1, 1, 0, writer.getLogFile().getPath()), + basePath.toString(), + fileSlice.getLogFiles().collect(Collectors.toList()), + false, + Option.empty()); realtimeFileStatus.setMaxCommitTime(instantTime); - realtimeFileStatus.setBasePath(basePath.toString()); - realtimeFileStatus.setDeltaLogFiles(fileSlice.getLogFiles().collect(Collectors.toList())); - PathWithLogFilePath pathWithLogFileStatus = (PathWithLogFilePath) realtimeFileStatus.getPath(); - BaseFileWithLogsSplit bs = pathWithLogFileStatus.buildSplit(pathWithLogFileStatus, 0, 0, new String[] {""}); - HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(bs, bs.getBasePath(), bs.getDeltaLogFiles(), bs.getMaxCommitTime(), Option.empty()); + HoodieRealtimePath realtimePath = (HoodieRealtimePath) realtimeFileStatus.getPath(); + HoodieRealtimeFileSplit split = + new HoodieRealtimeFileSplit(new FileSplit(realtimePath, 0, 0, new String[] {""}), realtimePath); JobConf newJobConf = new JobConf(baseJobConf); List fields = schema.getFields(); diff --git 
a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/testutils/InputFormatTestUtil.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/testutils/InputFormatTestUtil.java index 8c19524a2d651..836ad57121bd5 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/testutils/InputFormatTestUtil.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/testutils/InputFormatTestUtil.java @@ -20,6 +20,7 @@ import org.apache.hadoop.fs.LocalFileSystem; import org.apache.hadoop.fs.RawLocalFileSystem; +import org.apache.hadoop.hbase.io.compress.Compression; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieFileFormat; @@ -33,6 +34,7 @@ import org.apache.hudi.common.table.log.block.HoodieDataBlock; import org.apache.hudi.common.table.log.block.HoodieHFileDataBlock; import org.apache.hudi.common.table.log.block.HoodieLogBlock; +import org.apache.hudi.common.table.log.block.HoodieParquetDataBlock; import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.common.testutils.SchemaTestUtil; import org.apache.hudi.hadoop.utils.HoodieHiveUtils; @@ -47,6 +49,7 @@ import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; import org.apache.hadoop.mapred.JobConf; import org.apache.parquet.avro.AvroParquetWriter; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; import java.io.File; import java.io.IOException; @@ -243,9 +246,9 @@ public static File prepareNonPartitionedParquetTable(java.nio.file.Path basePath } public static List prepareMultiPartitionedParquetTable(java.nio.file.Path basePath, Schema schema, - int numberPartitions, int numberOfRecordsPerPartition, String commitNumber) throws IOException { + int numberPartitions, int numberOfRecordsPerPartition, String commitNumber, HoodieTableType tableType) throws IOException { List result = new ArrayList<>(); - HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString()); + HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(), tableType, HoodieFileFormat.PARQUET); for (int i = 0; i < numberPartitions; i++) { java.nio.file.Path partitionPath = basePath.resolve(Paths.get(2016 + i + "", "05", "01")); setupPartition(basePath, partitionPath); @@ -363,8 +366,14 @@ public static HoodieLogFormat.Writer writeDataBlockToLogFile(File partitionDir, Map header = new HashMap<>(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, newCommit); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, writeSchema.toString()); - HoodieDataBlock dataBlock = (logBlockType == HoodieLogBlock.HoodieLogBlockType.HFILE_DATA_BLOCK) ? 
new HoodieHFileDataBlock(records, header) : - new HoodieAvroDataBlock(records, header); + HoodieDataBlock dataBlock = null; + if (logBlockType == HoodieLogBlock.HoodieLogBlockType.HFILE_DATA_BLOCK) { + dataBlock = new HoodieHFileDataBlock(records, header, Compression.Algorithm.GZ); + } else if (logBlockType == HoodieLogBlock.HoodieLogBlockType.PARQUET_DATA_BLOCK) { + dataBlock = new HoodieParquetDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD, CompressionCodecName.GZIP); + } else { + dataBlock = new HoodieAvroDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD); + } writer.appendBlock(dataBlock); return writer; } diff --git a/hudi-integ-test/README.md b/hudi-integ-test/README.md index ffdedf849298e..7ee4598ba3bcb 100644 --- a/hudi-integ-test/README.md +++ b/hudi-integ-test/README.md @@ -82,8 +82,8 @@ spark-submit 2. YAML file -Choose to write up the entire DAG of operations in YAML, take a look at `complex-dag-cow.yaml` or -`complex-dag-mor.yaml`. +Choose to write up the entire DAG of operations in YAML; take a look at +`simple-deltastreamer.yaml`. Once you're ready with the DAG you want to execute, simply pass the yaml file path as follows: ``` @@ -177,7 +177,7 @@ cd /opt Copy the integration tests jar into the docker container ``` -docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar adhoc-2:/opt +docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar adhoc-2:/opt ``` ``` @@ -217,7 +217,7 @@ spark-submit \ --conf spark.driver.extraClassPath=/var/demo/jars/* \ --conf spark.executor.extraClassPath=/var/demo/jars/* \ --class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \ -/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \ +/opt/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar \ --source-ordering-field test_suite_source_ordering_field \ --use-deltastreamer \ --target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \ @@ -227,7 +227,7 @@ spark-submit \ --schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \ --source-class org.apache.hudi.utilities.sources.AvroDFSSource \ --input-file-size 125829120 \ ---workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/complex-dag-cow.yaml \ +--workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/simple-deltastreamer.yaml \ --workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \ --table-type COPY_ON_WRITE \ --compact-scheduling-minshare 1 \ @@ -264,7 +264,7 @@ spark-submit \ --conf spark.driver.extraClassPath=/var/demo/jars/* \ --conf spark.executor.extraClassPath=/var/demo/jars/* \ --class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \ -/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \ +/opt/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar \ --source-ordering-field test_suite_source_ordering_field \ --use-deltastreamer \ --target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \ @@ -274,7 +274,7 @@ spark-submit \ --schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \ --source-class org.apache.hudi.utilities.sources.AvroDFSSource \ --input-file-size 125829120 \ ---workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/complex-dag-mor.yaml \ +--workload-yaml-path file:/var/hoodie/ws/docker/demo/config/test-suite/simple-deltastreamer.yaml \ --workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \
--table-type MERGE_ON_READ \ --compact-scheduling-minshare 1 \ @@ -308,16 +308,16 @@ contents both via spark datasource and hive table via spark sql engine. Hive val If you have "ValidateDatasetNode" in your dag, do not replace hive jars as instructed above. Spark sql engine does not work well w/ hive2* jars. So, after running docker setup, follow the steps below. ``` -docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar adhoc-2:/opt/ +docker cp packaging/hudi-integ-test-bundle/target/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar adhoc-2:/opt/ docker cp docker/demo/config/test-suite/test.properties adhoc-2:/opt/ ``` Also copy your dag of interest to adhoc-2:/opt/ ``` -docker cp docker/demo/config/test-suite/complex-dag-cow.yaml adhoc-2:/opt/ +docker cp docker/demo/config/test-suite/simple-deltastreamer.yaml adhoc-2:/opt/ ``` For repeated runs, two additional configs need to be set: "dag_rounds" and "dag_intermittent_delay_mins". -This means that your dag will be repeated for N times w/ a delay of Y mins between each round. Note: complex-dag-cow.yaml +This means that your dag will be repeated N times w/ a delay of Y mins between each round. Note: simple-deltastreamer.yaml already has all these configs set, so no changes are required just to try it out. Also, ValidateDatasetNode can be configured in two ways. Either with "delete_input_data" set to true or without @@ -457,7 +457,7 @@ spark-submit \ --conf spark.driver.extraClassPath=/var/demo/jars/* \ --conf spark.executor.extraClassPath=/var/demo/jars/* \ --class org.apache.hudi.integ.testsuite.HoodieTestSuiteJob \ -/opt/hudi-integ-test-bundle-0.10.0-SNAPSHOT.jar \ +/opt/hudi-integ-test-bundle-0.11.0-SNAPSHOT.jar \ --source-ordering-field test_suite_source_ordering_field \ --use-deltastreamer \ --target-base-path /user/hive/warehouse/hudi-integ-test-suite/output \ @@ -467,7 +467,7 @@ spark-submit \ --schemaprovider-class org.apache.hudi.integ.testsuite.schema.TestSuiteFileBasedSchemaProvider \ --source-class org.apache.hudi.utilities.sources.AvroDFSSource \ --input-file-size 125829120 \ ---workload-yaml-path file:/opt/complex-dag-cow.yaml \ +--workload-yaml-path file:/opt/simple-deltastreamer.yaml \ --workload-generator-classname org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator \ --table-type COPY_ON_WRITE \ --compact-scheduling-minshare 1 \ @@ -486,8 +486,8 @@ If you wish to enable metrics add below properties as well A few ready-to-use dags are available under docker/demo/config/test-suite/ that can give you an idea for long-running dags. ``` -complex-dag-cow.yaml: simple 1 round dag for COW table. -complex-dag-mor.yaml: simple 1 round dag for MOR table. +simple-deltastreamer.yaml: simple 1 round dag that works for both COW and MOR tables. cow-clustering-example.yaml : dag with 3 rounds, in which inline clustering will trigger during 2nd iteration. cow-long-running-example.yaml : long running dag with 50 iterations. only 1 partition is used. cow-long-running-multi-partitions.yaml: long running dag with 50 iterations with multiple partitions.
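To make the two repeated-run knobs concrete, here is a minimal sketch of how they sit at the top of a dag YAML. The structure mirrors the shipped test-suite configs under docker/demo/config/test-suite/, but the file name, node names and values below are illustrative, not a specific shipped dag:

```
dag_name: repeated-run-example.yaml   # illustrative name, not a shipped file
dag_rounds: 3                         # N: repeat the whole DAG 3 times
dag_intermittent_delay_mins: 1        # Y: wait 1 min between rounds
dag_content:
  first_insert:
    config:
      record_size: 1000
      num_partitions_insert: 1
      repeat_count: 1
      num_records_insert: 1000
    type: InsertNode
    deps: none
  last_validate:
    config:
      # keep this aligned with dag_rounds so validation fires on the final round
      execute_itr_count: 3
      validate_clean: true
    type: ValidateAsyncOperations
    deps: first_insert
```

The same dag can then double as a one-shot smoke test (`dag_rounds: 1`) or a long-running soak test just by changing these two values, which is the pattern the long-running dags listed above follow.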
diff --git a/hudi-integ-test/pom.xml b/hudi-integ-test/pom.xml index 7ca976f9f80a6..08affb5e48dee 100644 --- a/hudi-integ-test/pom.xml +++ b/hudi-integ-test/pom.xml @@ -363,7 +363,6 @@ org.awaitility awaitility - 3.1.2 test diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieTestSuiteWriter.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieTestSuiteWriter.java index 41ef3f4ab968c..a98c7f2aec3f0 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieTestSuiteWriter.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/HoodieTestSuiteWriter.java @@ -18,16 +18,15 @@ package org.apache.hudi.integ.testsuite; -import java.io.IOException; -import java.io.Serializable; -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.client.HoodieReadClient; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.model.HoodieAvroRecord; +import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.util.Option; @@ -36,6 +35,7 @@ import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodiePayloadConfig; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob.HoodieTestSuiteConfig; import org.apache.hudi.integ.testsuite.dag.nodes.CleanNode; @@ -43,8 +43,13 @@ import org.apache.hudi.integ.testsuite.dag.nodes.RollbackNode; import org.apache.hudi.integ.testsuite.dag.nodes.ScheduleCompactNode; import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats; +import org.apache.hudi.table.HoodieSparkTable; +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.action.compact.CompactHelpers; import org.apache.hudi.utilities.schema.SchemaProvider; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; import org.apache.hadoop.conf.Configuration; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; @@ -52,6 +57,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.io.Serializable; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; @@ -128,7 +135,7 @@ public RDD getNextBatch() throws Exception { Pair>> nextBatch = fetchSource(); lastCheckpoint = Option.of(nextBatch.getValue().getLeft()); JavaRDD inputRDD = nextBatch.getRight().getRight(); - return inputRDD.map(r -> (GenericRecord) r.getData() + return inputRDD.map(r -> (GenericRecord) ((HoodieAvroRecord) r).getData() .getInsertValue(new Schema.Parser().parse(schema)).get()).rdd(); } @@ -214,7 +221,8 @@ public JavaRDD compact(Option instantTime) throws Exception } } if (instantTime.isPresent()) { - return (JavaRDD) writeClient.compact(instantTime.get()); + HoodieWriteMetadata> compactionMetadata = writeClient.compact(instantTime.get()); + return compactionMetadata.getWriteStatuses(); } else { return null; } @@ -271,7 +279,9 @@ public void commitCompaction(JavaRDD 
records, JavaRDD s.getFilePath()).collect().get(0)); } - writeClient.commitCompaction(instantTime.get(), records, Option.of(extraMetadata)); + HoodieSparkTable table = HoodieSparkTable.create(writeClient.getConfig(), writeClient.getEngineContext()); + HoodieCommitMetadata metadata = CompactHelpers.getInstance().createCompactionMetadata(table, instantTime.get(), HoodieJavaRDD.of(records), writeClient.getConfig().getSchema()); + writeClient.commitCompaction(instantTime.get(), metadata, Option.of(extraMetadata)); } } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/configuration/DeltaConfig.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/configuration/DeltaConfig.java index b0ae06b6039d4..2c39f5f93a52c 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/configuration/DeltaConfig.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/configuration/DeltaConfig.java @@ -95,6 +95,9 @@ public static class Config { private static String SCHEMA_VERSION = "schema_version"; private static String NUM_ROLLBACKS = "num_rollbacks"; private static String ENABLE_ROW_WRITING = "enable_row_writing"; + private static String ENABLE_METADATA_VALIDATE = "enable_metadata_validate"; + private static String VALIDATE_FULL_DATA = "validate_full_data"; + private static String DELETE_INPUT_DATA_EXCEPT_LATEST = "delete_input_data_except_latest"; // Spark SQL Create Table private static String TABLE_TYPE = "table_type"; @@ -149,6 +152,10 @@ public int getRecordSize() { return Integer.valueOf(configsMap.getOrDefault(RECORD_SIZE, 1024).toString()); } + public boolean isEnableMetadataValidate() { + return Boolean.valueOf(configsMap.getOrDefault(ENABLE_METADATA_VALIDATE, false).toString()); + } + public int getNumInsertPartitions() { return Integer.valueOf(configsMap.getOrDefault(NUM_PARTITIONS_INSERT, 1).toString()); } @@ -201,10 +208,18 @@ public boolean isDeleteInputData() { return Boolean.valueOf(configsMap.getOrDefault(DELETE_INPUT_DATA, false).toString()); } + public boolean isDeleteInputDataExceptLatest() { + return Boolean.valueOf(configsMap.getOrDefault(DELETE_INPUT_DATA_EXCEPT_LATEST, false).toString()); + } + public boolean isValidateHive() { return Boolean.valueOf(configsMap.getOrDefault(VALIDATE_HIVE, false).toString()); } + public boolean isValidateFullData() { + return Boolean.valueOf(configsMap.getOrDefault(VALIDATE_FULL_DATA, false).toString()); + } + public int getIterationCountToExecute() { return Integer.valueOf(configsMap.getOrDefault(EXECUTE_ITR_COUNT, -1).toString()); } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BaseValidateDatasetNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BaseValidateDatasetNode.java index 1ae6d948f3e43..09d44d986e183 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BaseValidateDatasetNode.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/BaseValidateDatasetNode.java @@ -19,16 +19,14 @@ package org.apache.hudi.integ.testsuite.dag.nodes; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; - import org.apache.hudi.DataSourceWriteOptions; -import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.integ.testsuite.configuration.DeltaConfig; import org.apache.hudi.integ.testsuite.dag.ExecutionContext; import org.apache.hudi.integ.testsuite.schema.SchemaUtils; +import 
org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.ReduceFunction; import org.apache.spark.sql.Dataset; @@ -42,13 +40,13 @@ import org.apache.spark.sql.types.StructType; import org.slf4j.Logger; +import java.util.List; +import java.util.stream.Collectors; + import scala.Tuple2; import scala.collection.JavaConversions; import scala.collection.JavaConverters; -import java.util.List; -import java.util.stream.Collectors; - /** * This node validates that the contents of the input path are intact in Hudi. By default no configs are required for this node, but there is an * optional config "delete_input_data" that you can set for this node. If set, once validation completes, contents from inputPath are deleted. This will come in handy for long running test suites. */ @@ -78,6 +76,7 @@ public abstract Dataset getDatasetToValidate(SparkSession session, Executio public void execute(ExecutionContext context, int curItrCount) throws Exception { SparkSession session = SparkSession.builder().sparkContext(context.getJsc().sc()).getOrCreate(); + // todo: Fix partitioning schemes. For now, assumes data based partitioning. String inputPath = context.getHoodieTestSuiteWriter().getCfg().inputBasePath + "/*/*"; log.warn("Validation using data from input path " + inputPath); @@ -97,43 +96,60 @@ public void execute(ExecutionContext context, int curItrCount) throws Exception // read from hudi and remove meta columns. Dataset trimmedHudiDf = getDatasetToValidate(session, context, inputSnapshotDf.schema()); - Dataset intersectionDf = inputSnapshotDf.intersect(trimmedHudiDf); - long inputCount = inputSnapshotDf.count(); - long outputCount = trimmedHudiDf.count(); - log.debug("Input count: " + inputCount + "; output count: " + outputCount); - // the intersected df should be same as inputDf. if not, there is some mismatch. - if (outputCount == 0 || inputCount == 0 || inputSnapshotDf.except(intersectionDf).count() != 0) { - log.error("Data set validation failed. Total count in hudi " + outputCount + ", input df count " + inputCount); - throw new AssertionError("Hudi contents does not match contents input data. "); - } - - if (config.isValidateHive()) { - String database = context.getWriterContext().getProps().getString(DataSourceWriteOptions.HIVE_DATABASE().key()); - String tableName = context.getWriterContext().getProps().getString(DataSourceWriteOptions.HIVE_TABLE().key()); - log.warn("Validating hive table with db : " + database + " and table : " + tableName); - Dataset cowDf = session.sql("SELECT * FROM " + database + "."
+ tableName); - Dataset trimmedCowDf = cowDf.drop(HoodieRecord.COMMIT_TIME_METADATA_FIELD).drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD).drop(HoodieRecord.RECORD_KEY_METADATA_FIELD) - .drop(HoodieRecord.PARTITION_PATH_METADATA_FIELD).drop(HoodieRecord.FILENAME_METADATA_FIELD); - intersectionDf = inputSnapshotDf.intersect(trimmedCowDf); - outputCount = trimmedHudiDf.count(); - log.warn("Input count: " + inputCount + "; output count: " + outputCount); + if (config.isValidateFullData()) { + log.debug("Validating full dataset"); + Dataset exceptInputDf = inputSnapshotDf.except(trimmedHudiDf); + Dataset exceptHudiDf = trimmedHudiDf.except(inputSnapshotDf); + long exceptInputCount = exceptInputDf.count(); + long exceptHudiCount = exceptHudiDf.count(); + log.debug("Except input df count " + exceptInputCount + ", except hudi count " + exceptHudiCount); + if (exceptInputCount != 0 || exceptHudiCount != 0) { + log.error("Data set validation failed. Total count in hudi " + trimmedHudiDf.count() + ", input df count " + inputSnapshotDf.count() + + ". InputDf except hudi df = " + exceptInputCount + ", Hudi df except Input df " + exceptHudiCount); + throw new AssertionError("Hudi contents do not match the input data. "); + } + } else { + Dataset intersectionDf = inputSnapshotDf.intersect(trimmedHudiDf); + long inputCount = inputSnapshotDf.count(); + long outputCount = trimmedHudiDf.count(); + log.debug("Input count: " + inputCount + "; output count: " + outputCount); // the intersected df should be same as inputDf. if not, there is some mismatch. - if (outputCount == 0 || inputCount == 0 || inputSnapshotDf.except(intersectionDf).count() != 0) { - log.error("Data set validation failed. Total count in hudi " + outputCount + ", input df count " + inputCount); - throw new AssertionError("Hudi contents does not match contents input data. "); + if (outputCount == 0 || inputCount == 0 || inputSnapshotDf.except(intersectionDf).count() != 0) { + log.error("Data set validation failed. Total count in hudi " + outputCount + ", input df count " + inputCount); + throw new AssertionError("Hudi contents do not match the input data. "); } - } - // if delete input data is enabled, erase input data. - if (config.isDeleteInputData()) { - // clean up input data for current group of writes. - inputPathStr = context.getHoodieTestSuiteWriter().getCfg().inputBasePath; - FileSystem fs = new Path(inputPathStr) - .getFileSystem(context.getHoodieTestSuiteWriter().getConfiguration()); - FileStatus[] fileStatuses = fs.listStatus(new Path(inputPathStr)); - for (FileStatus fileStatus : fileStatuses) { - log.debug("Micro batch to be deleted " + fileStatus.getPath().toString()); - fs.delete(fileStatus.getPath(), true); + if (config.isValidateHive()) { + String database = context.getWriterContext().getProps().getString(DataSourceWriteOptions.HIVE_DATABASE().key()); + String tableName = context.getWriterContext().getProps().getString(DataSourceWriteOptions.HIVE_TABLE().key()); + log.warn("Validating hive table with db : " + database + " and table : " + tableName); + session.sql("REFRESH TABLE " + database + "." + tableName); + Dataset cowDf = session.sql("SELECT _row_key, rider, driver, begin_lat, begin_lon, end_lat, end_lon, fare, _hoodie_is_deleted, " + + "test_suite_source_ordering_field FROM " + database + "."
+ tableName); + Dataset reorderedInputDf = inputSnapshotDf.select("_row_key", "rider", "driver", "begin_lat", "begin_lon", "end_lat", "end_lon", "fare", + "_hoodie_is_deleted", "test_suite_source_ordering_field"); + + Dataset intersectedHiveDf = reorderedInputDf.intersect(cowDf); + outputCount = trimmedHudiDf.count(); + log.warn("Input count: " + inputCount + "; output count: " + outputCount); + // the intersected df should be same as inputDf. if not, there is some mismatch. + if (outputCount == 0 || reorderedInputDf.except(intersectedHiveDf).count() != 0) { + log.error("Data set validation failed for COW hive table. Total count in hudi " + outputCount + ", input df count " + inputCount); + throw new AssertionError("Hudi hive table contents do not match the input data. "); + } + } + + // if delete input data is enabled, erase input data. + if (config.isDeleteInputData()) { + // clean up input data for current group of writes. + inputPathStr = context.getHoodieTestSuiteWriter().getCfg().inputBasePath; + FileSystem fs = new Path(inputPathStr) + .getFileSystem(context.getHoodieTestSuiteWriter().getConfiguration()); + FileStatus[] fileStatuses = fs.listStatus(new Path(inputPathStr)); + for (FileStatus fileStatus : fileStatuses) { + log.debug("Micro batch to be deleted " + fileStatus.getPath().toString()); + fs.delete(fileStatus.getPath(), true); + } } } } @@ -146,8 +162,8 @@ private Dataset getInputDf(ExecutionContext context, SparkSession session, Dataset inputDf = session.read().format("avro").load(inputPath); ExpressionEncoder encoder = getEncoder(inputDf.schema()); return inputDf.groupByKey( - (MapFunction) value -> - value.getAs(partitionPathField) + "+" + value.getAs(recordKeyField), Encoders.STRING()) + (MapFunction) value -> + value.getAs(partitionPathField) + "+" + value.getAs(recordKeyField), Encoders.STRING()) .reduceGroups((ReduceFunction) (v1, v2) -> { int ts1 = v1.getAs(SchemaUtils.SOURCE_ORDERING_FIELD); int ts2 = v2.getAs(SchemaUtils.SOURCE_ORDERING_FIELD); diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/DeleteInputDatasetNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/DeleteInputDatasetNode.java new file mode 100644 index 0000000000000..2836f240ead3c --- /dev/null +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/DeleteInputDatasetNode.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
+ */ + +package org.apache.hudi.integ.testsuite.dag.nodes; + +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig; +import org.apache.hudi.integ.testsuite.dag.ExecutionContext; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +/** + * Deletes all input except latest batch. Mostly used in insert_overwrite operations. + */ +public class DeleteInputDatasetNode extends DagNode { + + public DeleteInputDatasetNode(DeltaConfig.Config config) { + this.config = config; + } + + @Override + public void execute(ExecutionContext context, int curItrCount) throws Exception { + + String latestBatch = String.valueOf(context.getWriterContext().getDeltaGenerator().getBatchId()); + + if (config.isDeleteInputDataExceptLatest()) { + String inputPathStr = context.getHoodieTestSuiteWriter().getCfg().inputBasePath; + FileSystem fs = new Path(inputPathStr) + .getFileSystem(context.getHoodieTestSuiteWriter().getConfiguration()); + FileStatus[] fileStatuses = fs.listStatus(new Path(inputPathStr)); + for (FileStatus fileStatus : fileStatuses) { + if (!fileStatus.getPath().getName().equals(latestBatch)) { + log.debug("Micro batch to be deleted " + fileStatus.getPath().toString()); + fs.delete(fileStatus.getPath(), true); + } + } + } + } +} diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateDatasetNode.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateDatasetNode.java index 03b37a9fc2b39..cc293ea470164 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateDatasetNode.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/dag/nodes/ValidateDatasetNode.java @@ -18,6 +18,7 @@ package org.apache.hudi.integ.testsuite.dag.nodes; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config; import org.apache.hudi.integ.testsuite.dag.ExecutionContext; @@ -49,7 +50,8 @@ public Dataset getDatasetToValidate(SparkSession session, ExecutionContext StructType inputSchema) { String hudiPath = context.getHoodieTestSuiteWriter().getCfg().targetBasePath + "/*/*/*"; log.info("Validate data in target hudi path " + hudiPath); - Dataset hudiDf = session.read().format("hudi").load(hudiPath); + Dataset hudiDf = session.read().option(HoodieMetadataConfig.ENABLE.key(), String.valueOf(config.isEnableMetadataValidate())) + .format("hudi").load(hudiPath); return hudiDf.drop(HoodieRecord.COMMIT_TIME_METADATA_FIELD).drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD).drop(HoodieRecord.RECORD_KEY_METADATA_FIELD) .drop(HoodieRecord.PARTITION_PATH_METADATA_FIELD).drop(HoodieRecord.FILENAME_METADATA_FIELD); } diff --git a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/DeltaGenerator.java b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/DeltaGenerator.java index 6d5bc4ffedeca..69e32dfbc1182 100644 --- a/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/DeltaGenerator.java +++ b/hudi-integ-test/src/main/java/org/apache/hudi/integ/testsuite/generator/DeltaGenerator.java @@ -110,6 +110,10 @@ public JavaRDD writeRecords(JavaRDD records) { return ws; } + public int getBatchId() { + return batchId; + } + public JavaRDD generateInserts(Config operation) { int numPartitions = operation.getNumInsertPartitions(); long recordsPerPartition = 
operation.getNumRecordsInsert(); diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkInsertNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkInsertNode.scala index 1b69cf8faf494..b8c46cad3fd69 100644 --- a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkInsertNode.scala +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkInsertNode.scala @@ -54,13 +54,18 @@ class SparkInsertNode(dagNodeConfig: Config) extends DagNode[RDD[WriteStatus]] { context.getWriterContext.getSparkSession) inputDF.write.format("hudi") .options(DataSourceWriteOptions.translateSqlOptions(context.getWriterContext.getProps.asScala.toMap)) + .option(DataSourceWriteOptions.PRECOMBINE_FIELD.key(), "test_suite_source_ordering_field") .option(DataSourceWriteOptions.TABLE_NAME.key, context.getHoodieTestSuiteWriter.getCfg.targetTableName) .option(DataSourceWriteOptions.TABLE_TYPE.key, context.getHoodieTestSuiteWriter.getCfg.tableType) - .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) + .option(DataSourceWriteOptions.OPERATION.key, getOperation()) .option(DataSourceWriteOptions.COMMIT_METADATA_KEYPREFIX.key, "deltastreamer.checkpoint.key") .option("deltastreamer.checkpoint.key", context.getWriterContext.getHoodieTestSuiteWriter.getLastCheckpoint.orElse("")) .option(HoodieWriteConfig.TBL_NAME.key, context.getHoodieTestSuiteWriter.getCfg.targetTableName) - .mode(SaveMode.Overwrite) + .mode(SaveMode.Append) .save(context.getHoodieTestSuiteWriter.getWriteConfig.getBasePath) } + + def getOperation(): String = { + DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL + } } diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkInsertOverwriteNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkInsertOverwriteNode.scala new file mode 100644 index 0000000000000..6dd2eac522974 --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkInsertOverwriteNode.scala @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.integ.testsuite.dag.nodes + +import org.apache.hudi.DataSourceWriteOptions +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config + +class SparkInsertOverwriteNode(dagNodeConfig: Config) extends SparkInsertNode(dagNodeConfig) { + + override def getOperation(): String = { + DataSourceWriteOptions.INSERT_OVERWRITE_OPERATION_OPT_VAL + } + +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkInsertOverwriteTableNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkInsertOverwriteTableNode.scala new file mode 100644 index 0000000000000..a6b80b3a90cc1 --- /dev/null +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkInsertOverwriteTableNode.scala @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.integ.testsuite.dag.nodes + +import org.apache.hudi.DataSourceWriteOptions +import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config + +class SparkInsertOverwriteTableNode(dagNodeConfig: Config) extends SparkInsertNode(dagNodeConfig) { + + override def getOperation(): String = { + DataSourceWriteOptions.INSERT_OVERWRITE_TABLE_OPERATION_OPT_VAL + } +} diff --git a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkUpsertNode.scala b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkUpsertNode.scala index 858827a7b2c47..113de93adbb3a 100644 --- a/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkUpsertNode.scala +++ b/hudi-integ-test/src/main/scala/org/apache/hudi/integ/testsuite/dag/nodes/SparkUpsertNode.scala @@ -18,49 +18,17 @@ package org.apache.hudi.integ.testsuite.dag.nodes -import org.apache.hudi.client.WriteStatus -import org.apache.hudi.config.HoodieWriteConfig +import org.apache.hudi.DataSourceWriteOptions import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config -import org.apache.hudi.integ.testsuite.dag.ExecutionContext -import org.apache.hudi.{AvroConversionUtils, DataSourceWriteOptions} -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SaveMode - -import scala.collection.JavaConverters._ /** * Spark datasource based upsert node * * @param dagNodeConfig DAG node configurations. */ -class SparkUpsertNode(dagNodeConfig: Config) extends DagNode[RDD[WriteStatus]] { - - config = dagNodeConfig +class SparkUpsertNode(dagNodeConfig: Config) extends SparkInsertNode(dagNodeConfig) { - /** - * Execute the {@link DagNode}. - * - * @param context The context needed for an execution of a node. - * @param curItrCount iteration count for executing the node. - * @throws Exception Thrown if the execution failed. 
- */ - override def execute(context: ExecutionContext, curItrCount: Int): Unit = { - if (!config.isDisableGenerate) { - println("Generating input data for node {}", this.getName) - context.getDeltaGenerator().writeRecords(context.getDeltaGenerator().generateInserts(config)).count() - } - val inputDF = AvroConversionUtils.createDataFrame(context.getWriterContext.getHoodieTestSuiteWriter.getNextBatch, - context.getWriterContext.getHoodieTestSuiteWriter.getSchema, - context.getWriterContext.getSparkSession) - inputDF.write.format("hudi") - .options(DataSourceWriteOptions.translateSqlOptions(context.getWriterContext.getProps.asScala.toMap)) - .option(DataSourceWriteOptions.TABLE_NAME.key, context.getHoodieTestSuiteWriter.getCfg.targetTableName) - .option(DataSourceWriteOptions.TABLE_TYPE.key, context.getHoodieTestSuiteWriter.getCfg.tableType) - .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) - .option(DataSourceWriteOptions.COMMIT_METADATA_KEYPREFIX.key, "deltastreamer.checkpoint.key") - .option("deltastreamer.checkpoint.key", context.getWriterContext.getHoodieTestSuiteWriter.getLastCheckpoint.orElse("")) - .option(HoodieWriteConfig.TBL_NAME.key, context.getHoodieTestSuiteWriter.getCfg.targetTableName) - .mode(SaveMode.Append) - .save(context.getHoodieTestSuiteWriter.getWriteConfig.getBasePath) + override def getOperation(): String = { + DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL } } diff --git a/hudi-kafka-connect/pom.xml b/hudi-kafka-connect/pom.xml index b12468579161e..8845bfb801ae3 100644 --- a/hudi-kafka-connect/pom.xml +++ b/hudi-kafka-connect/pom.xml @@ -66,7 +66,7 @@ com.github.os72 protoc-jar-maven-plugin - 3.1.0.1 + 3.11.4 generate-sources diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/AbstractConnectWriter.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/AbstractConnectWriter.java index a579484f67369..649150d16c828 100644 --- a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/AbstractConnectWriter.java +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/AbstractConnectWriter.java @@ -20,6 +20,7 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordLocation; import org.apache.hudi.common.util.Option; @@ -81,7 +82,7 @@ public void writeRecord(SinkRecord record) throws IOException { } // Tag records with a file ID based on kafka partition and hudi partition. 
- HoodieRecord hoodieRecord = new HoodieRecord<>(keyGenerator.getKey(avroRecord.get()), new HoodieAvroPayload(avroRecord)); + HoodieRecord hoodieRecord = new HoodieAvroRecord<>(keyGenerator.getKey(avroRecord.get()), new HoodieAvroPayload(avroRecord)); String fileId = KafkaConnectUtils.hashDigest(String.format("%s-%s", record.kafkaPartition(), hoodieRecord.getPartitionPath())); hoodieRecord.unseal(); hoodieRecord.setCurrentLocation(new HoodieRecordLocation(instantTime, fileId)); diff --git a/hudi-kafka-connect/src/test/java/org/apache/hudi/writers/TestAbstractConnectWriter.java b/hudi-kafka-connect/src/test/java/org/apache/hudi/writers/TestAbstractConnectWriter.java index c8a3ad6ffd92e..7a286e565ea34 100644 --- a/hudi-kafka-connect/src/test/java/org/apache/hudi/writers/TestAbstractConnectWriter.java +++ b/hudi-kafka-connect/src/test/java/org/apache/hudi/writers/TestAbstractConnectWriter.java @@ -22,6 +22,7 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.testutils.SchemaTestUtil; @@ -168,7 +169,7 @@ protected List flushRecords() { } private static HoodieRecord convertToHoodieRecords(IndexedRecord iRecord, String key, String partitionPath) { - return new HoodieRecord<>(new HoodieKey(key, partitionPath), + return new HoodieAvroRecord<>(new HoodieKey(key, partitionPath), new HoodieAvroPayload(Option.of((GenericRecord) iRecord))); } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/DataSourceUtils.java b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/DataSourceUtils.java index a632b5a4e9096..bad6c2d7219e1 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/DataSourceUtils.java +++ b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/DataSourceUtils.java @@ -18,14 +18,12 @@ package org.apache.hudi; -import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.hudi.client.HoodieReadClient; import org.apache.hudi.client.HoodieWriteResult; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; @@ -47,12 +45,17 @@ import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor; import org.apache.hudi.table.BulkInsertPartitioner; import org.apache.hudi.util.DataTypeUtils; + +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; +import org.apache.spark.sql.hive.HiveExternalCatalog; import org.apache.spark.sql.types.StructType; import java.io.IOException; @@ -235,13 +238,13 @@ public static HoodieWriteResult doDeletePartitionsOperation(SparkRDDWriteClient public static HoodieRecord createHoodieRecord(GenericRecord gr, 
Comparable orderingVal, HoodieKey hKey, String payloadClass) throws IOException { HoodieRecordPayload payload = DataSourceUtils.createPayload(payloadClass, gr, orderingVal); - return new HoodieRecord<>(hKey, payload); + return new HoodieAvroRecord<>(hKey, payload); } public static HoodieRecord createHoodieRecord(GenericRecord gr, HoodieKey hKey, String payloadClass) throws IOException { HoodieRecordPayload payload = DataSourceUtils.createPayload(payloadClass, gr); - return new HoodieRecord<>(hKey, payload); + return new HoodieAvroRecord<>(hKey, payload); } /** @@ -290,6 +293,8 @@ public static HiveSyncConfig buildHiveSyncConfig(TypedProperties props, String b props.getString(DataSourceWriteOptions.HIVE_PASS().key(), DataSourceWriteOptions.HIVE_PASS().defaultValue()); hiveSyncConfig.jdbcUrl = props.getString(DataSourceWriteOptions.HIVE_URL().key(), DataSourceWriteOptions.HIVE_URL().defaultValue()); + hiveSyncConfig.metastoreUris = + props.getString(DataSourceWriteOptions.METASTORE_URIS().key(), DataSourceWriteOptions.METASTORE_URIS().defaultValue()); hiveSyncConfig.partitionFields = props.getStringList(DataSourceWriteOptions.HIVE_PARTITION_FIELDS().key(), ",", new ArrayList<>()); hiveSyncConfig.partitionValueExtractorClass = @@ -314,6 +319,9 @@ public static HiveSyncConfig buildHiveSyncConfig(TypedProperties props, String b (boolean) DataSourceWriteOptions.HIVE_SYNC_BUCKET_SYNC().defaultValue()) ? HiveSyncConfig.getBucketSpec(props.getString(HoodieIndexConfig.BUCKET_INDEX_HASH_FIELD.key()), props.getInteger(HoodieIndexConfig.BUCKET_INDEX_NUM_BUCKETS.key())) : null; + if (props.containsKey(HiveExternalCatalog.CREATED_SPARK_VERSION())) { + hiveSyncConfig.sparkVersion = props.getString(HiveExternalCatalog.CREATED_SPARK_VERSION()); + } return hiveSyncConfig; } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/SparkRowWriteHelper.java b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/SparkRowWriteHelper.java index 6f5dd3713d74f..ea9c9b2c03d93 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/SparkRowWriteHelper.java +++ b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/SparkRowWriteHelper.java @@ -19,7 +19,6 @@ package org.apache.hudi; import org.apache.hudi.common.model.HoodieRecord; - import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.ReduceFunction; import org.apache.spark.sql.Dataset; @@ -30,14 +29,13 @@ import org.apache.spark.sql.catalyst.encoders.RowEncoder; import org.apache.spark.sql.catalyst.expressions.Attribute; import org.apache.spark.sql.types.StructType; - -import java.util.List; -import java.util.stream.Collectors; - import scala.Tuple2; import scala.collection.JavaConversions; import scala.collection.JavaConverters; +import java.util.List; +import java.util.stream.Collectors; + /** * Helper class to assist in deduplicating Rows for BulkInsert with Rows. */ @@ -55,20 +53,13 @@ public static SparkRowWriteHelper newInstance() { } public Dataset deduplicateRows(Dataset inputDf, String preCombineField, boolean isGlobalIndex) { - ExpressionEncoder encoder = getEncoder(inputDf.schema()); - - return inputDf.groupByKey( - (MapFunction) value -> - isGlobalIndex ? 
(value.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD)) : - (value.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD) + "+" + value.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD)), Encoders.STRING()) - .reduceGroups((ReduceFunction) (v1, v2) -> { - if (((Comparable) v1.getAs(preCombineField)).compareTo(((Comparable) v2.getAs(preCombineField))) >= 0) { - return v1; - } else { - return v2; - } - } - ).map((MapFunction, Row>) value -> value._2, encoder); + return inputDf.groupByKey((MapFunction) value -> + isGlobalIndex + ? (value.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD)) + : (value.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD) + "+" + value.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD)), Encoders.STRING()) + .reduceGroups((ReduceFunction) (v1, v2) -> + ((Comparable) v1.getAs(preCombineField)).compareTo(v2.getAs(preCombineField)) >= 0 ? v1 : v2) + .map((MapFunction, Row>) value -> value._2, getEncoder(inputDf.schema())); } private ExpressionEncoder getEncoder(StructType schema) { diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/async/SparkStreamingAsyncClusteringService.java b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/async/SparkStreamingAsyncClusteringService.java index 81d880ee974df..f87e16a652900 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/async/SparkStreamingAsyncClusteringService.java +++ b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/async/SparkStreamingAsyncClusteringService.java @@ -19,8 +19,8 @@ package org.apache.hudi.async; -import org.apache.hudi.client.AbstractClusteringClient; -import org.apache.hudi.client.AbstractHoodieWriteClient; +import org.apache.hudi.client.BaseClusterer; +import org.apache.hudi.client.BaseHoodieWriteClient; import org.apache.hudi.client.HoodieSparkClusteringClient; /** @@ -31,12 +31,12 @@ public class SparkStreamingAsyncClusteringService extends AsyncClusteringService private static final long serialVersionUID = 1L; - public SparkStreamingAsyncClusteringService(AbstractHoodieWriteClient writeClient) { + public SparkStreamingAsyncClusteringService(BaseHoodieWriteClient writeClient) { super(writeClient, true); } @Override - protected AbstractClusteringClient createClusteringClient(AbstractHoodieWriteClient client) { + protected BaseClusterer createClusteringClient(BaseHoodieWriteClient client) { return new HoodieSparkClusteringClient(client); } } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/async/SparkStreamingAsyncCompactService.java b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/async/SparkStreamingAsyncCompactService.java index 130ea7c27595a..2ff7b46c02018 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/async/SparkStreamingAsyncCompactService.java +++ b/hudi-spark-datasource/hudi-spark-common/src/main/java/org/apache/hudi/async/SparkStreamingAsyncCompactService.java @@ -18,8 +18,8 @@ package org.apache.hudi.async; -import org.apache.hudi.client.AbstractCompactor; -import org.apache.hudi.client.AbstractHoodieWriteClient; +import org.apache.hudi.client.BaseCompactor; +import org.apache.hudi.client.BaseHoodieWriteClient; import org.apache.hudi.client.HoodieSparkCompactor; import org.apache.hudi.common.engine.HoodieEngineContext; @@ -31,12 +31,12 @@ public class SparkStreamingAsyncCompactService extends AsyncCompactService { private static final long serialVersionUID = 1L; - public 
SparkStreamingAsyncCompactService(HoodieEngineContext context, AbstractHoodieWriteClient client) { + public SparkStreamingAsyncCompactService(HoodieEngineContext context, BaseHoodieWriteClient client) { super(context, client, true); } @Override - protected AbstractCompactor createCompactor(AbstractHoodieWriteClient client) { + protected BaseCompactor createCompactor(BaseHoodieWriteClient client) { return new HoodieSparkCompactor(client, this.context); } } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/BaseFileOnlyViewRelation.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/BaseFileOnlyViewRelation.scala new file mode 100644 index 0000000000000..8e94805328c69 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/BaseFileOnlyViewRelation.scala @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi + +import org.apache.hadoop.fs.Path + +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.hadoop.HoodieROTablePathFilter + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Expression, Literal} +import org.apache.spark.sql.execution.datasources.{FileStatusCache, PartitionedFile} +import org.apache.spark.sql.{Row, SQLContext} +import org.apache.spark.sql.sources.{BaseRelation, Filter} +import org.apache.spark.sql.types.{BooleanType, StructType} + +/** + * The implementation of [[BaseRelation]], used to respond to queries that only touch the base files (Parquet), + * such as querying COW tables in Snapshot-Query and Read_Optimized mode and MOR tables in Read_Optimized mode.
+ */ +class BaseFileOnlyViewRelation( + sqlContext: SQLContext, + metaClient: HoodieTableMetaClient, + optParams: Map[String, String], + userSchema: Option[StructType], + globPaths: Seq[Path] + ) extends HoodieBaseRelation(sqlContext, metaClient, optParams, userSchema) with SparkAdapterSupport { + + override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { + sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.enableVectorizedReader", "false") + + val filterExpressions = HoodieSparkUtils.convertToCatalystExpressions(filters, tableStructSchema) .getOrElse(Literal(true, BooleanType)) + // split the combined predicate once into partition predicates and data predicates + val (partitionFilters, dataFilters) = + HoodieDataSourceHelper.splitPartitionAndDataPredicates(sparkSession, filterExpressions, partitionColumns) + val partitionFiles = getPartitionFiles(partitionFilters, dataFilters) + + val maxSplitBytes = sparkSession.sessionState.conf.filesMaxPartitionBytes + val filePartitions = sparkAdapter.getFilePartitions(sparkSession, partitionFiles, maxSplitBytes) + + val requiredSchemaParquetReader = HoodieDataSourceHelper.buildHoodieParquetReader( + sparkSession = sparkSession, + dataSchema = tableStructSchema, + partitionSchema = StructType(Nil), + requiredSchema = tableStructSchema, + filters = filters, + options = optParams, + hadoopConf = sparkSession.sessionState.newHadoopConf() + ) + + new HoodieFileScanRDD(sparkSession, requiredColumns, tableStructSchema, + requiredSchemaParquetReader, filePartitions) + } + + private def getPartitionFiles(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[PartitionedFile] = { + val partitionDirectories = if (globPaths.isEmpty) { + val hoodieFileIndex = HoodieFileIndex(sparkSession, metaClient, userSchema, optParams, + FileStatusCache.getOrCreate(sqlContext.sparkSession)) + hoodieFileIndex.listFiles(partitionFilters, dataFilters) + } else { + sqlContext.sparkContext.hadoopConfiguration.setClass( + "mapreduce.input.pathFilter.class", + classOf[HoodieROTablePathFilter], + classOf[org.apache.hadoop.fs.PathFilter]) + + val inMemoryFileIndex = HoodieSparkUtils.createInMemoryFileIndex(sparkSession, globPaths) + inMemoryFileIndex.listFiles(partitionFilters, dataFilters) + } + + val partitionFiles = partitionDirectories.flatMap { partition => + partition.files.flatMap { file => + HoodieDataSourceHelper.splitFiles( + sparkSession = sparkSession, + file = file, + partitionValues = partition.values + ) + } + } + + partitionFiles.map { f => + PartitionedFile(InternalRow.empty, f.filePath, f.start, f.length) + } + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala index 1e1d887906c99..8a98657f242e2 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala @@ -23,7 +23,7 @@ import org.apache.hudi.common.fs.ConsistencyGuardConfig import org.apache.hudi.common.model.{HoodieTableType, WriteOperationType} import org.apache.hudi.common.table.HoodieTableConfig import org.apache.hudi.common.util.Option -import org.apache.hudi.config.HoodieWriteConfig +import org.apache.hudi.config.{HoodieClusteringConfig, HoodieWriteConfig} import org.apache.hudi.hive.util.ConfigUtils import
org.apache.hudi.hive.{HiveStylePartitionValueExtractor, HiveSyncTool, MultiPartKeysValueExtractor, NonPartitionedExtractor, SlashEncodedDayPartitionValueExtractor} import org.apache.hudi.keygen.constant.KeyGeneratorOptions @@ -75,6 +75,7 @@ object DataSourceReadOptions { val ENABLE_HOODIE_FILE_INDEX: ConfigProperty[Boolean] = ConfigProperty .key("hoodie.file.index.enable") .defaultValue(true) + .deprecatedAfter("0.11.0") .withDocumentation("Enables use of the spark file index implementation for Hudi, " + "that speeds up listing of large tables.") @@ -119,8 +120,13 @@ object DataSourceReadOptions { .key("hoodie.enable.data.skipping") .defaultValue(true) .sinceVersion("0.10.0") - .withDocumentation("enable data skipping to boost query after doing z-order optimize for current table") + .withDocumentation("Enables data-skipping, allowing queries to leverage indexes to reduce the search space by " + + "skipping over files") + val INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN_FOR_NON_EXISTING_FILES: ConfigProperty[String] = ConfigProperty + .key("hoodie.datasource.read.incr.fallback.fulltablescan.enable") + .defaultValue("false") + .withDocumentation("When doing an incremental query, whether we should fall back to full table scans if a file does not exist.") /** @deprecated Use {@link QUERY_TYPE} and its methods instead */ @Deprecated val QUERY_TYPE_OPT_KEY = QUERY_TYPE.key() @@ -422,6 +428,11 @@ object DataSourceWriteOptions { val HIVE_URL: ConfigProperty[String] = ConfigProperty .key("hoodie.datasource.hive_sync.jdbcurl") .defaultValue("jdbc:hive2://localhost:10000") + .withDocumentation("Hive jdbc url") + + val METASTORE_URIS: ConfigProperty[String] = ConfigProperty + .key("hoodie.datasource.hive_sync.metastore.uris") + .defaultValue("thrift://localhost:9083") .withDocumentation("Hive metastore url") val hivePartitionFieldsInferFunc = DataSourceOptionsHelper.scalaFunctionToJavaFunction((p: HoodieConfig) => { @@ -550,17 +561,9 @@ object DataSourceWriteOptions { .defaultValue("true") .withDocumentation("Controls whether async compaction should be turned on for MOR table writing.") - val INLINE_CLUSTERING_ENABLE: ConfigProperty[String] = ConfigProperty - .key("hoodie.datasource.clustering.inline.enable") - .defaultValue("false") - .sinceVersion("0.9.0") - .withDocumentation("Enable inline clustering. Disabled by default.") + val INLINE_CLUSTERING_ENABLE = HoodieClusteringConfig.INLINE_CLUSTERING - val ASYNC_CLUSTERING_ENABLE: ConfigProperty[String] = ConfigProperty - .key("hoodie.datasource.clustering.async.enable") - .defaultValue("false") - .sinceVersion("0.9.0") - .withDocumentation("Enable asynchronous clustering.
Disabled by default.") + val ASYNC_CLUSTERING_ENABLE = HoodieClusteringConfig.ASYNC_CLUSTERING_ENABLE val KAFKA_AVRO_VALUE_DESERIALIZER_CLASS: ConfigProperty[String] = ConfigProperty .key("hoodie.deltastreamer.source.kafka.value.deserializer.class") @@ -632,10 +635,10 @@ object DataSourceWriteOptions { @Deprecated val HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY = HIVE_PARTITION_EXTRACTOR_CLASS.key() - /** @deprecated Use {@link KEYGENERATOR_CLASS} and its methods instead */ + /** @deprecated Use {@link KEYGENERATOR_CLASS_NAME} and its methods instead */ @Deprecated val DEFAULT_KEYGENERATOR_CLASS_OPT_VAL = KEYGENERATOR_CLASS_NAME.defaultValue() - /** @deprecated Use {@link KEYGENERATOR_CLASS} and its methods instead */ + /** @deprecated Use {@link KEYGENERATOR_CLASS_NAME} and its methods instead */ @Deprecated val KEYGENERATOR_CLASS_OPT_KEY = HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key() /** @deprecated Use {@link ENABLE_ROW_WRITER} and its methods instead */ diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala index 4c9e585c363e5..1508babcbba97 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala @@ -85,20 +85,15 @@ class DefaultSource extends RelationProvider val allPaths = path.map(p => Seq(p)).getOrElse(Seq()) ++ readPaths val fs = FSUtils.getFs(allPaths.head, sqlContext.sparkContext.hadoopConfiguration) - // Use the HoodieFileIndex only if the 'path' is not globbed. - // Or else we use the original way to read hoodie table. - val enableFileIndex = optParams.get(ENABLE_HOODIE_FILE_INDEX.key) - .map(_.toBoolean).getOrElse(ENABLE_HOODIE_FILE_INDEX.defaultValue) - val useHoodieFileIndex = enableFileIndex && path.isDefined && !path.get.contains("*") && - !parameters.contains(DataSourceReadOptions.READ_PATHS.key) - val globPaths = if (useHoodieFileIndex) { - None + + val globPaths = if (path.exists(_.contains("*")) || readPaths.nonEmpty) { + HoodieSparkUtils.checkAndGlobPathIfNecessary(allPaths, fs) } else { - Some(HoodieSparkUtils.checkAndGlobPathIfNecessary(allPaths, fs)) + Seq.empty } // Get the table base path - val tablePath = if (globPaths.isDefined) { - DataSourceUtils.getTablePath(fs, globPaths.get.toArray) + val tablePath = if (globPaths.nonEmpty) { + DataSourceUtils.getTablePath(fs, globPaths.toArray) } else { DataSourceUtils.getTablePath(fs, Array(new Path(path.get))) } @@ -108,6 +103,7 @@ class DefaultSource extends RelationProvider val isBootstrappedTable = metaClient.getTableConfig.getBootstrapBasePath.isPresent val tableType = metaClient.getTableType val queryType = parameters(QUERY_TYPE.key) + val userSchema = if (schema == null) Option.empty[StructType] else Some(schema) log.info(s"Is bootstrapped table => $isBootstrappedTable, tableType is: $tableType, queryType is: $queryType") if (metaClient.getCommitsTimeline.filterCompletedInstants.countInstants() == 0) { @@ -117,20 +113,19 @@ class DefaultSource extends RelationProvider case (COPY_ON_WRITE, QUERY_TYPE_SNAPSHOT_OPT_VAL, false) | (COPY_ON_WRITE, QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, false) | (MERGE_ON_READ, QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, false) => - getBaseFileOnlyView(useHoodieFileIndex, sqlContext, parameters, schema, tablePath, - readPaths, metaClient) + new BaseFileOnlyViewRelation(sqlContext, metaClient, parameters, userSchema, 
globPaths) case (COPY_ON_WRITE, QUERY_TYPE_INCREMENTAL_OPT_VAL, _) => - new IncrementalRelation(sqlContext, parameters, schema, metaClient) + new IncrementalRelation(sqlContext, parameters, userSchema, metaClient) case (MERGE_ON_READ, QUERY_TYPE_SNAPSHOT_OPT_VAL, false) => - new MergeOnReadSnapshotRelation(sqlContext, parameters, schema, globPaths, metaClient) + new MergeOnReadSnapshotRelation(sqlContext, parameters, userSchema, globPaths, metaClient) case (MERGE_ON_READ, QUERY_TYPE_INCREMENTAL_OPT_VAL, _) => - new MergeOnReadIncrementalRelation(sqlContext, parameters, schema, metaClient) + new MergeOnReadIncrementalRelation(sqlContext, parameters, userSchema, metaClient) case (_, _, true) => - new HoodieBootstrapRelation(sqlContext, schema, globPaths, metaClient, parameters) + new HoodieBootstrapRelation(sqlContext, userSchema, globPaths, metaClient, parameters) case (_, _, _) => throw new HoodieException(s"Invalid query type : $queryType for tableType: $tableType," + @@ -182,65 +177,6 @@ class DefaultSource extends RelationProvider override def shortName(): String = "hudi_v1" - private def getBaseFileOnlyView(useHoodieFileIndex: Boolean, - sqlContext: SQLContext, - optParams: Map[String, String], - schema: StructType, - tablePath: String, - extraReadPaths: Seq[String], - metaClient: HoodieTableMetaClient): BaseRelation = { - log.info("Loading Base File Only View with options :" + optParams) - val (tableFileFormat, formatClassName) = metaClient.getTableConfig.getBaseFileFormat match { - case HoodieFileFormat.PARQUET => (new ParquetFileFormat, "parquet") - case HoodieFileFormat.ORC => (new OrcFileFormat, "orc") - } - - if (useHoodieFileIndex) { - val fileIndex = HoodieFileIndex(sqlContext.sparkSession, metaClient, - if (schema == null) Option.empty[StructType] else Some(schema), - optParams, FileStatusCache.getOrCreate(sqlContext.sparkSession)) - - HadoopFsRelation( - fileIndex, - fileIndex.partitionSchema, - fileIndex.dataSchema, - bucketSpec = None, - fileFormat = tableFileFormat, - optParams)(sqlContext.sparkSession) - } else { - // this is just effectively RO view only, where `path` can contain a mix of - // non-hoodie/hoodie path files. set the path filter up - sqlContext.sparkContext.hadoopConfiguration.setClass( - "mapreduce.input.pathFilter.class", - classOf[HoodieROTablePathFilter], - classOf[org.apache.hadoop.fs.PathFilter]) - - val specifySchema = if (schema == null) { - // Load the schema from the commit meta data. - // Here we should specify the schema to the latest commit schema since - // the table schema evolution. - val tableSchemaResolver = new TableSchemaResolver(metaClient) - try { - Some(AvroConversionUtils.convertAvroSchemaToStructType(tableSchemaResolver.getTableAvroSchema)) - } catch { - case _: Throwable => - None // If there is no commit in the table, we can not get the schema - // with tableSchemaResolver, return None here. 
- } - } else { - Some(schema) - } - // simply return as a regular relation - DataSource.apply( - sparkSession = sqlContext.sparkSession, - paths = extraReadPaths, - userSpecifiedSchema = specifySchema, - className = formatClassName, - options = optParams) - .resolveRelation() - } - } - override def sourceSchema(sqlContext: SQLContext, schema: Option[StructType], providerName: String, diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala new file mode 100644 index 0000000000000..1e2946dd26e88 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi + +import org.apache.avro.Schema +import org.apache.avro.generic.GenericRecord +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hadoop.hbase.io.hfile.CacheConfig +import org.apache.hudi.common.config.SerializableConfiguration +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.model.HoodieFileFormat +import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} +import org.apache.hudi.common.util.StringUtils +import org.apache.hudi.io.storage.HoodieHFileReader +import org.apache.hudi.metadata.HoodieTableMetadata +import org.apache.spark.internal.Logging +import org.apache.spark.sql.avro.SchemaConverters +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.datasources.PartitionedFile +import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan} +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{SQLContext, SparkSession} + +import scala.collection.JavaConverters._ +import scala.util.Try + +case class HoodieTableSchema(structTypeSchema: StructType, avroSchemaStr: String) + +/** + * Hoodie BaseRelation which extends [[PrunedFilteredScan]]. + */ +abstract class HoodieBaseRelation( + val sqlContext: SQLContext, + metaClient: HoodieTableMetaClient, + optParams: Map[String, String], + userSchema: Option[StructType]) + extends BaseRelation with PrunedFilteredScan with Logging { + + protected val sparkSession: SparkSession = sqlContext.sparkSession + + protected lazy val tableAvroSchema: Schema = { + val schemaUtil = new TableSchemaResolver(metaClient) + Try(schemaUtil.getTableAvroSchema).getOrElse( + // If there is no commit in the table, we can't get the schema + // through [[TableSchemaResolver]]; fall back to the provided [[userSchema]] instead.
+ userSchema match { + case Some(s) => SchemaConverters.toAvroType(s) + case _ => throw new IllegalArgumentException("User-provided schema is required in case the table is empty") + } + ) + } + + protected val tableStructSchema: StructType = AvroConversionUtils.convertAvroSchemaToStructType(tableAvroSchema) + + protected val partitionColumns: Array[String] = metaClient.getTableConfig.getPartitionFields.orElse(Array.empty) + + protected def getPrecombineFieldProperty: Option[String] = + Option(metaClient.getTableConfig.getPreCombineField) + .orElse(optParams.get(DataSourceWriteOptions.PRECOMBINE_FIELD.key)) match { + // NOTE: This is required to compensate for cases when empty string is used to stub + // property value to avoid it being set with the default value + // TODO(HUDI-3456) cleanup + case Some(f) if !StringUtils.isNullOrEmpty(f) => Some(f) + case _ => None + } + + override def schema: StructType = tableStructSchema +} + +object HoodieBaseRelation { + + def isMetadataTable(metaClient: HoodieTableMetaClient) = + HoodieTableMetadata.isMetadataTable(metaClient.getBasePath) + + /** + * Returns file-reader routine accepting [[PartitionedFile]] and returning an [[Iterator]] + * over [[InternalRow]] + */ + def createBaseFileReader(spark: SparkSession, + partitionSchema: StructType, + tableSchema: HoodieTableSchema, + requiredSchema: HoodieTableSchema, + filters: Seq[Filter], + options: Map[String, String], + hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { + val hfileReader = createHFileReader( + spark = spark, + tableSchema = tableSchema, + requiredSchema = requiredSchema, + filters = filters, + options = options, + hadoopConf = hadoopConf + ) + val parquetReader = HoodieDataSourceHelper.buildHoodieParquetReader( + sparkSession = spark, + dataSchema = tableSchema.structTypeSchema, + partitionSchema = partitionSchema, + requiredSchema = requiredSchema.structTypeSchema, + filters = filters, + options = options, + hadoopConf = hadoopConf + ) + + partitionedFile => { + val extension = FSUtils.getFileExtension(partitionedFile.filePath) + if (HoodieFileFormat.PARQUET.getFileExtension.equals(extension)) { + parquetReader.apply(partitionedFile) + } else if (HoodieFileFormat.HFILE.getFileExtension.equals(extension)) { + hfileReader.apply(partitionedFile) + } else { + throw new UnsupportedOperationException(s"Base file format not supported by Spark DataSource ($partitionedFile)") + } + } + } + + private def createHFileReader(spark: SparkSession, + tableSchema: HoodieTableSchema, + requiredSchema: HoodieTableSchema, + filters: Seq[Filter], + options: Map[String, String], + hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { + val hadoopConfBroadcast = + spark.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) + + partitionedFile => { + val hadoopConf = hadoopConfBroadcast.value.get() + val reader = new HoodieHFileReader[GenericRecord](hadoopConf, new Path(partitionedFile.filePath), + new CacheConfig(hadoopConf)) + + val requiredRowSchema = requiredSchema.structTypeSchema + // NOTE: Schema has to be parsed at this point, since Avro's [[Schema]] aren't serializable + // to be passed from driver to executor + val requiredAvroSchema = new Schema.Parser().parse(requiredSchema.avroSchemaStr) + val avroToRowConverter = AvroConversionUtils.createAvroToInternalRowConverter(requiredAvroSchema, requiredRowSchema) + + reader.getRecordIterator(requiredAvroSchema).asScala + .map(record => { + 
avroToRowConverter.apply(record.asInstanceOf[GenericRecord]).get + }) + } + } +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBootstrapRDD.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBootstrapRDD.scala index a522db6afc6f1..ea997c86acb39 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBootstrapRDD.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBootstrapRDD.scala @@ -24,12 +24,13 @@ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.PartitionedFile import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.vectorized.ColumnarBatch + +import org.apache.hudi.HoodieDataSourceHelper._ class HoodieBootstrapRDD(@transient spark: SparkSession, - dataReadFunction: PartitionedFile => Iterator[Any], - skeletonReadFunction: PartitionedFile => Iterator[Any], - regularReadFunction: PartitionedFile => Iterator[Any], + dataReadFunction: PartitionedFile => Iterator[InternalRow], + skeletonReadFunction: PartitionedFile => Iterator[InternalRow], + regularReadFunction: PartitionedFile => Iterator[InternalRow], dataSchema: StructType, skeletonSchema: StructType, requiredColumns: Array[String], @@ -56,18 +57,18 @@ class HoodieBootstrapRDD(@transient spark: SparkSession, // It is a bootstrap split. Check both skeleton and data files. if (dataSchema.isEmpty) { // No data column to fetch, hence fetch only from skeleton file - partitionedFileIterator = read(bootstrapPartition.split.skeletonFile.get, skeletonReadFunction) + partitionedFileIterator = skeletonReadFunction(bootstrapPartition.split.skeletonFile.get) } else if (skeletonSchema.isEmpty) { // No metadata column to fetch, hence fetch only from data file - partitionedFileIterator = read(bootstrapPartition.split.dataFile, dataReadFunction) + partitionedFileIterator = dataReadFunction(bootstrapPartition.split.dataFile) } else { // Fetch from both data and skeleton file, and merge - val dataFileIterator = read(bootstrapPartition.split.dataFile, dataReadFunction) - val skeletonFileIterator = read(bootstrapPartition.split.skeletonFile.get, skeletonReadFunction) + val dataFileIterator = dataReadFunction(bootstrapPartition.split.dataFile) + val skeletonFileIterator = skeletonReadFunction(bootstrapPartition.split.skeletonFile.get) partitionedFileIterator = merge(skeletonFileIterator, dataFileIterator) } } else { - partitionedFileIterator = read(bootstrapPartition.split.dataFile, regularReadFunction) + partitionedFileIterator = regularReadFunction(bootstrapPartition.split.dataFile) } partitionedFileIterator } @@ -101,19 +102,6 @@ class HoodieBootstrapRDD(@transient spark: SparkSession, mergedRow } - def read(partitionedFile: PartitionedFile, readFileFunction: PartitionedFile => Iterator[Any]) - : Iterator[InternalRow] = { - val fileIterator = readFileFunction(partitionedFile) - - import scala.collection.JavaConverters._ - - val rows = fileIterator.flatMap(_ match { - case r: InternalRow => Seq(r) - case b: ColumnarBatch => b.rowIterator().asScala - }) - rows - } - override protected def getPartitions: Array[Partition] = { tableState.files.zipWithIndex.map(file => { if (file._1.skeletonFile.isDefined) { diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBootstrapRelation.scala 
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBootstrapRelation.scala index b1ab83a94cc9d..dd90d724c6b61 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBootstrapRelation.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBootstrapRelation.scala @@ -52,8 +52,8 @@ import scala.collection.JavaConverters._ * @param optParams DataSource options passed by the user */ class HoodieBootstrapRelation(@transient val _sqlContext: SQLContext, - val userSchema: StructType, - val globPaths: Option[Seq[Path]], + val userSchema: Option[StructType], + val globPaths: Seq[Path], val metaClient: HoodieTableMetaClient, val optParams: Map[String, String]) extends BaseRelation with PrunedFilteredScan with Logging { @@ -107,37 +107,35 @@ class HoodieBootstrapRelation(@transient val _sqlContext: SQLContext, }) // Prepare readers for reading data file and skeleton files - val dataReadFunction = new ParquetFileFormat() - .buildReaderWithPartitionValues( - sparkSession = _sqlContext.sparkSession, - dataSchema = dataSchema, - partitionSchema = StructType(Seq.empty), - requiredSchema = requiredDataSchema, - filters = if (requiredSkeletonSchema.isEmpty) filters else Seq() , - options = Map.empty, - hadoopConf = _sqlContext.sparkSession.sessionState.newHadoopConf() - ) - - val skeletonReadFunction = new ParquetFileFormat() - .buildReaderWithPartitionValues( - sparkSession = _sqlContext.sparkSession, - dataSchema = skeletonSchema, - partitionSchema = StructType(Seq.empty), - requiredSchema = requiredSkeletonSchema, - filters = if (requiredDataSchema.isEmpty) filters else Seq(), - options = Map.empty, - hadoopConf = _sqlContext.sparkSession.sessionState.newHadoopConf() - ) - - val regularReadFunction = new ParquetFileFormat() - .buildReaderWithPartitionValues( - sparkSession = _sqlContext.sparkSession, - dataSchema = fullSchema, - partitionSchema = StructType(Seq.empty), - requiredSchema = requiredColsSchema, - filters = filters, - options = Map.empty, - hadoopConf = _sqlContext.sparkSession.sessionState.newHadoopConf()) + val dataReadFunction = HoodieDataSourceHelper.buildHoodieParquetReader( + sparkSession = _sqlContext.sparkSession, + dataSchema = dataSchema, + partitionSchema = StructType(Seq.empty), + requiredSchema = requiredDataSchema, + filters = if (requiredSkeletonSchema.isEmpty) filters else Seq() , + options = optParams, + hadoopConf = _sqlContext.sparkSession.sessionState.newHadoopConf() + ) + + val skeletonReadFunction = HoodieDataSourceHelper.buildHoodieParquetReader( + sparkSession = _sqlContext.sparkSession, + dataSchema = skeletonSchema, + partitionSchema = StructType(Seq.empty), + requiredSchema = requiredSkeletonSchema, + filters = if (requiredDataSchema.isEmpty) filters else Seq(), + options = optParams, + hadoopConf = _sqlContext.sparkSession.sessionState.newHadoopConf() + ) + + val regularReadFunction = HoodieDataSourceHelper.buildHoodieParquetReader( + sparkSession = _sqlContext.sparkSession, + dataSchema = fullSchema, + partitionSchema = StructType(Seq.empty), + requiredSchema = requiredColsSchema, + filters = filters, + options = optParams, + hadoopConf = _sqlContext.sparkSession.sessionState.newHadoopConf() + ) val rdd = new HoodieBootstrapRDD(_sqlContext.sparkSession, dataReadFunction, skeletonReadFunction, regularReadFunction, requiredDataSchema, requiredSkeletonSchema, requiredColumns, tableState) @@ -157,9 +155,9 @@ class HoodieBootstrapRelation(@transient val _sqlContext: 
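// Why the filter arguments above alternate between `filters` and Seq(): skeleton and
// data files of a bootstrapped file slice store rows in the same order and are merged
// positionally (a zip of the two iterators). Pushing a predicate into only one side
// would drop rows there and misalign the zip, so filters are pushed down only when a
// single file is being read, conceptually:
//
//   val dataFilters = if (requiredSkeletonSchema.isEmpty) filters else Seq.empty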
SQLContext,
 
   def buildFileIndex(): HoodieBootstrapFileIndex = {
     logInfo("Building file index..")
-    val fileStatuses = if (globPaths.isDefined) {
+    val fileStatuses = if (globPaths.nonEmpty) {
       // Load files from the global paths if it has defined to be compatible with the original mode
-      val inMemoryFileIndex = HoodieSparkUtils.createInMemoryFileIndex(_sqlContext.sparkSession, globPaths.get)
+      val inMemoryFileIndex = HoodieSparkUtils.createInMemoryFileIndex(_sqlContext.sparkSession, globPaths)
       inMemoryFileIndex.allFiles()
     } else {
       // Load files by the HoodieFileIndex.
       HoodieFileIndex(sqlContext.sparkSession, metaClient, Some(schema), optParams,
diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieDataSourceHelper.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieDataSourceHelper.scala
new file mode 100644
index 0000000000000..fb12549f620bd
--- /dev/null
+++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieDataSourceHelper.scala
@@ -0,0 +1,154 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.FileStatus
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.{Expression, PredicateHelper, SpecificInternalRow, SubqueryExpression, UnsafeProjection}
+import org.apache.spark.sql.execution.datasources.PartitionedFile
+import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
+import org.apache.spark.sql.sources.Filter
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.vectorized.ColumnarBatch
+
+import scala.collection.JavaConverters._
+
+object HoodieDataSourceHelper extends PredicateHelper {
+
+  /**
+   * Partition the given condition into two sequences of conjunctive predicates:
+   * - predicates that can be evaluated using metadata only.
+   * - other predicates.
+   */
+  def splitPartitionAndDataPredicates(
+      spark: SparkSession,
+      condition: Expression,
+      partitionColumns: Seq[String]): (Seq[Expression], Seq[Expression]) = {
+    splitConjunctivePredicates(condition).partition(
+      isPredicateMetadataOnly(spark, _, partitionColumns))
+  }
+
+  /**
+   * Check if condition can be evaluated using only metadata. This means the condition
+   * only references partition columns and involves no subquery.
+   */
+  def isPredicateMetadataOnly(
+      spark: SparkSession,
+      condition: Expression,
+      partitionColumns: Seq[String]): Boolean = {
+    isPredicatePartitionColumnsOnly(spark, condition, partitionColumns) &&
+      !SubqueryExpression.hasSubquery(condition)
+  }
+
+  /**
+   * Does the predicate only contain partition columns?
+ */ + def isPredicatePartitionColumnsOnly( + spark: SparkSession, + condition: Expression, + partitionColumns: Seq[String]): Boolean = { + val nameEquality = spark.sessionState.analyzer.resolver + condition.references.forall { r => + partitionColumns.exists(nameEquality(r.name, _)) + } + } + + /** + * Wrapper `buildReaderWithPartitionValues` of [[ParquetFileFormat]] + * to deal with [[ColumnarBatch]] when enable parquet vectorized reader if necessary. + */ + def buildHoodieParquetReader(sparkSession: SparkSession, + dataSchema: StructType, + partitionSchema: StructType, + requiredSchema: StructType, + filters: Seq[Filter], + options: Map[String, String], + hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { + + val readParquetFile: PartitionedFile => Iterator[Any] = new ParquetFileFormat().buildReaderWithPartitionValues( + sparkSession = sparkSession, + dataSchema = dataSchema, + partitionSchema = partitionSchema, + requiredSchema = requiredSchema, + filters = filters, + options = options, + hadoopConf = hadoopConf + ) + + file: PartitionedFile => { + val iter = readParquetFile(file) + iter.flatMap { + case r: InternalRow => Seq(r) + case b: ColumnarBatch => b.rowIterator().asScala + } + } + } + + /** + * Extract the required schema from [[InternalRow]] + */ + def extractRequiredSchema( + iter: Iterator[InternalRow], + requiredSchema: StructType, + requiredFieldPos: Seq[Int]): Iterator[InternalRow] = { + val unsafeProjection = UnsafeProjection.create(requiredSchema) + val rows = iter.map { row => + unsafeProjection(createInternalRowWithSchema(row, requiredSchema, requiredFieldPos)) + } + rows + } + + /** + * Convert [[InternalRow]] to [[SpecificInternalRow]]. + */ + def createInternalRowWithSchema( + row: InternalRow, + schema: StructType, + positions: Seq[Int]): InternalRow = { + val rowToReturn = new SpecificInternalRow(schema) + var curIndex = 0 + schema.zip(positions).foreach { case (field, pos) => + val curField = if (row.isNullAt(pos)) { + null + } else { + row.get(pos, field.dataType) + } + rowToReturn.update(curIndex, curField) + curIndex += 1 + } + rowToReturn + } + + + def splitFiles( + sparkSession: SparkSession, + file: FileStatus, + partitionValues: InternalRow): Seq[PartitionedFile] = { + val filePath = file.getPath + val maxSplitBytes = sparkSession.sessionState.conf.filesMaxPartitionBytes + (0L until file.getLen by maxSplitBytes).map { offset => + val remaining = file.getLen - offset + val size = if (remaining > maxSplitBytes) maxSplitBytes else remaining + PartitionedFile(partitionValues, filePath.toUri.toString, offset, size) + } + } + +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala index f9a7620b9ffe9..9cdf5cc634ff9 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala @@ -18,20 +18,30 @@ package org.apache.hudi import org.apache.hadoop.fs.{FileStatus, Path} + import org.apache.hudi.HoodieFileIndex.getConfigProperties import org.apache.hudi.common.config.{HoodieMetadataConfig, TypedProperties} import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.util.StringUtils +import org.apache.hudi.keygen.constant.KeyGeneratorOptions +import org.apache.hudi.keygen.{TimestampBasedAvroKeyGenerator, TimestampBasedKeyGenerator} + 
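// Worked example for convertFilterForTimestampKeyGenerator, defined further below.
// Assuming the table was written with a TimestampBasedKeyGenerator configured with
// input format "yyyy-MM-dd" and output format "yyyy/MM/dd", a pushed-down partition
// literal has to be rewritten to match the materialized partition path before pruning:
//
//   val inFmt  = new SimpleDateFormat("yyyy-MM-dd")
//   val outFmt = new SimpleDateFormat("yyyy/MM/dd")
//   outFmt.format(inFmt.parse("2022-01-26"))  // "2022/01/26", comparable to stored paths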
+import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{And, Expression} +import org.apache.spark.sql.catalyst.expressions.{And, Expression, Literal} import org.apache.spark.sql.execution.datasources.{FileIndex, FileStatusCache, NoopCache, PartitionDirectory} import org.apache.spark.sql.hudi.DataSkippingUtils.createColumnStatsIndexFilterExpr import org.apache.spark.sql.hudi.HoodieSqlCommonUtils import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{AnalysisException, Column, SparkSession} +import org.apache.spark.unsafe.types.UTF8String import scala.collection.JavaConverters._ import scala.util.{Failure, Success, Try} +import scala.util.control.NonFatal + +import java.text.SimpleDateFormat /** * A file index which support partition prune for hoodie snapshot and read-optimized query. @@ -72,7 +82,7 @@ case class HoodieFileIndex(spark: SparkSession, ) with FileIndex { - override def rootPaths: Seq[Path] = queryPaths + override def rootPaths: Seq[Path] = queryPaths.asScala def enableDataSkipping(): Boolean = { options.getOrElse(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key(), @@ -88,7 +98,7 @@ case class HoodieFileIndex(spark: SparkSession, * @return List of FileStatus for base files */ def allFiles: Seq[FileStatus] = { - cachedAllInputFileSlices.values.flatten + cachedAllInputFileSlices.values.asScala.flatMap(_.asScala) .filter(_.getBaseFile.isPresent) .map(_.getBaseFile.get().getFileStatus) .toSeq @@ -101,31 +111,33 @@ case class HoodieFileIndex(spark: SparkSession, * @param dataFilters data columns filters * @return list of PartitionDirectory containing partition to base files mapping */ - override def listFiles(partitionFilters: Seq[Expression], - dataFilters: Seq[Expression]): Seq[PartitionDirectory] = { + override def listFiles(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[PartitionDirectory] = { + + val convertedPartitionFilters = + HoodieFileIndex.convertFilterForTimestampKeyGenerator(metaClient, partitionFilters) + // Look up candidate files names in the col-stats index, if all of the following conditions are true // - Data-skipping is enabled // - Col-Stats Index is present // - List of predicates (filters) is present val candidateFilesNamesOpt: Option[Set[String]] = - lookupCandidateFilesInColStatsIndex(dataFilters) match { - case Success(opt) => opt - case Failure(e) => - if (e.isInstanceOf[AnalysisException]) { - logDebug("Failed to relay provided data filters to Z-index lookup", e) - } else { - logError("Failed to lookup candidate files in Z-index", e) - } - Option.empty - } + lookupCandidateFilesInColStatsIndex(dataFilters) match { + case Success(opt) => opt + case Failure(e) => + if (e.isInstanceOf[AnalysisException]) { + logDebug("Failed to relay provided data filters to Z-index lookup", e) + } else { + logError("Failed to lookup candidate files in Z-index", e) + } + Option.empty + } logDebug(s"Overlapping candidate files (from Z-index): ${candidateFilesNamesOpt.getOrElse(Set.empty)}") if (queryAsNonePartitionedTable) { // Read as Non-Partitioned table // Filter in candidate files based on the col-stats index lookup - val candidateFiles = - allFiles.filter(fileStatus => + val candidateFiles = allFiles.filter(fileStatus => // NOTE: This predicate is true when {@code Option} is empty 
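// The Option semantics the NOTE below leans on, spelled out: `forall` over an empty
// Option is vacuously true, so an absent col-stats index prunes nothing.
//
//   Option.empty[Set[String]].forall(_.contains("f1"))  // true  -> file kept
//   Some(Set("f1")).forall(_.contains("f1"))            // true  -> file kept
//   Some(Set("f1")).forall(_.contains("f2"))            // false -> file pruned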
          candidateFilesNamesOpt.forall(_.contains(fileStatus.getPath.getName))
        )
@@ -137,22 +149,21 @@ case class HoodieFileIndex(spark: SparkSession,
       Seq(PartitionDirectory(InternalRow.empty, candidateFiles))
     } else {
       // Prune the partition path by the partition filters
-      val prunedPartitions = prunePartition(cachedAllInputFileSlices.keys.toSeq, partitionFilters)
+      val prunedPartitions = prunePartition(cachedAllInputFileSlices.keySet.asScala.toSeq, convertedPartitionFilters)
       var totalFileSize = 0
       var candidateFileSize = 0
 
       val result = prunedPartitions.map { partition =>
         val baseFileStatuses: Seq[FileStatus] =
-          cachedAllInputFileSlices(partition)
+          cachedAllInputFileSlices.get(partition).asScala
            .map(fs => fs.getBaseFile.orElse(null))
            .filter(_ != null)
            .map(_.getFileStatus)
 
         // Filter in candidate files based on the col-stats index lookup
-        val candidateFiles =
-          baseFileStatuses.filter(fs =>
-            // NOTE: This predicate is true when {@code Option} is empty
-            candidateFilesNamesOpt.forall(_.contains(fs.getPath.getName)))
+        val candidateFiles = baseFileStatuses.filter(fs =>
+          // NOTE: This predicate is true when {@code Option} is empty
+          candidateFilesNamesOpt.forall(_.contains(fs.getPath.getName)))
 
         totalFileSize += baseFileStatuses.size
         candidateFileSize += candidateFiles.size
@@ -194,12 +205,14 @@ case class HoodieFileIndex(spark: SparkSession,
       // scalastyle:on return
     }
 
+    val completedCommits = getActiveTimeline.filterCompletedInstants().getInstants.iterator.asScala.toList.map(_.getTimestamp)
+
     // Collect all index tables present in `.zindex` folder
     val candidateIndexTables = fs.listStatus(new Path(indexPath))
       .filter(_.isDirectory)
       .map(_.getPath.getName)
-      .filter(f => completedCommits.contains(f))
+      .filter(completedCommits.contains(_))
      .sortBy(x => x)
 
     if (candidateIndexTables.isEmpty) {
@@ -267,7 +280,7 @@ case class HoodieFileIndex(spark: SparkSession,
   }
 }
 
-object HoodieFileIndex {
+object HoodieFileIndex extends Logging {
 
   def getConfigProperties(spark: SparkSession, options: Map[String, String]) = {
     val sqlConf: SQLConf = spark.sessionState.conf
@@ -282,6 +295,41 @@ object HoodieFileIndex {
     properties
   }
 
+  def convertFilterForTimestampKeyGenerator(metaClient: HoodieTableMetaClient,
+      partitionFilters: Seq[Expression]): Seq[Expression] = {
+
+    val tableConfig = metaClient.getTableConfig
+    val keyGenerator = tableConfig.getKeyGeneratorClassName
+
+    if (keyGenerator != null && (keyGenerator.equals(classOf[TimestampBasedKeyGenerator].getCanonicalName) ||
+        keyGenerator.equals(classOf[TimestampBasedAvroKeyGenerator].getCanonicalName))) {
+      val inputFormat = tableConfig.getString(KeyGeneratorOptions.Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP)
+      val outputFormat = tableConfig.getString(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP)
+      if (StringUtils.isNullOrEmpty(inputFormat) || StringUtils.isNullOrEmpty(outputFormat) ||
+          inputFormat.equals(outputFormat)) {
+        partitionFilters
+      } else {
+        try {
+          val inDateFormat = new SimpleDateFormat(inputFormat)
+          val outDateFormat = new SimpleDateFormat(outputFormat)
+          partitionFilters.toArray.map {
+            _.transformDown {
+              case Literal(value, dataType) if dataType.isInstanceOf[StringType] =>
+                val converted = outDateFormat.format(inDateFormat.parse(value.toString))
+                Literal(UTF8String.fromString(converted), StringType)
+            }
+          }
+        } catch {
+          case NonFatal(e) =>
+            logWarning("Failed to convert filters for TimestampBasedAvroKeyGenerator", e)
+            partitionFilters
+        }
+      }
+    } else {
+      partitionFilters
+    }
+  }
+
  private def getQueryPath(options: Map[String,
String]) = { new Path(options.getOrElse("path", "'path' option required")) } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileScanRDD.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileScanRDD.scala new file mode 100644 index 0000000000000..9f2d7d9e0380a --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileScanRDD.scala @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi + +import org.apache.spark.{Partition, TaskContext} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} +import org.apache.spark.sql.execution.QueryExecutionException +import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.execution.datasources.{FilePartition, PartitionedFile, SchemaColumnConvertNotSupportedException} +import org.apache.spark.sql.types.StructType + +/** + * Similar to [[org.apache.spark.sql.execution.datasources.FileScanRDD]]. + * + * This class will extract the fields needed according to [[requiredColumns]] and + * return iterator of [[org.apache.spark.sql.Row]] directly. + */ +class HoodieFileScanRDD( + @transient private val sparkSession: SparkSession, + requiredColumns: Array[String], + schema: StructType, + readFunction: PartitionedFile => Iterator[InternalRow], + @transient val filePartitions: Seq[FilePartition]) + extends RDD[Row](sparkSession.sparkContext, Nil) { + + private val requiredSchema = { + val nameToStructField = schema.map(field => (field.name, field)).toMap + StructType(requiredColumns.map(nameToStructField)) + } + + private val requiredFieldPos = HoodieSparkUtils.collectFieldIndexes(requiredSchema, schema) + + override def compute(split: Partition, context: TaskContext): Iterator[Row] = { + val iterator = new Iterator[Object] with AutoCloseable { + + private[this] val files = split.asInstanceOf[FilePartition].files.toIterator + private[this] var currentFile: PartitionedFile = null + private[this] var currentIterator: Iterator[Object] = null + + override def hasNext: Boolean = { + (currentIterator != null && currentIterator.hasNext) || nextIterator() + } + + def next(): Object = { + currentIterator.next() + } + + /** Advances to the next file. Returns true if a new non-empty iterator is available. */ + private def nextIterator(): Boolean = { + if (files.hasNext) { + currentFile = files.next() + + logInfo(s"Reading File $currentFile") + currentIterator = readFunction(currentFile) + + try { + hasNext + } catch { + case e: SchemaColumnConvertNotSupportedException => + val message = "Parquet column cannot be converted in " + + s"file ${currentFile.filePath}. 
Column: ${e.getColumn}, " + + s"Expected: ${e.getLogicalType}, Found: ${e.getPhysicalType}" + throw new QueryExecutionException(message, e) + + case e => throw e + } + } else { + currentFile = null + false + } + } + + override def close(): Unit = {} + } + + // Register an on-task-completion callback to close the input stream. + context.addTaskCompletionListener[Unit](_ => iterator.close()) + + // extract required columns from row + val iterAfterExtract = HoodieDataSourceHelper.extractRequiredSchema( + iterator.asInstanceOf[Iterator[InternalRow]], + requiredSchema, + requiredFieldPos) + + // convert InternalRow to Row and return + val converter = CatalystTypeConverters.createToScalaConverter(requiredSchema) + iterAfterExtract.map(converter(_).asInstanceOf[Row]) + } + + override protected def getPartitions: Array[Partition] = filePartitions.toArray + +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieMergeOnReadRDD.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieMergeOnReadRDD.scala index 226fb01f43f90..96fe47e0219d4 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieMergeOnReadRDD.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieMergeOnReadRDD.scala @@ -21,22 +21,28 @@ package org.apache.hudi import org.apache.avro.Schema import org.apache.avro.generic.{GenericRecord, GenericRecordBuilder} import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hudi.HoodieDataSourceHelper._ +import org.apache.hudi.HoodieMergeOnReadRDD.resolveAvroSchemaNullability +import org.apache.hudi.MergeOnReadSnapshotRelation.getFilePath +import org.apache.hudi.common.config.HoodieMetadataConfig +import org.apache.hudi.common.engine.HoodieLocalEngineContext import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.common.fs.FSUtils.getRelativePartitionPath import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner import org.apache.hudi.config.HoodiePayloadConfig import org.apache.hudi.exception.HoodieException import org.apache.hudi.hadoop.config.HoodieRealtimeConfig -import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.HOODIE_RECORD_KEY_COL_POS +import org.apache.hudi.metadata.HoodieTableMetadata.getDataTableBasePathFromMetadataTable +import org.apache.hudi.metadata.{HoodieBackedTableMetadata, HoodieTableMetadata} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.avro.{HoodieAvroSerializer, HoodieAvroDeserializer} import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{SpecificInternalRow, UnsafeProjection} +import org.apache.spark.sql.catalyst.expressions.UnsafeProjection import org.apache.spark.sql.execution.datasources.PartitionedFile -import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.{Partition, SerializableWritable, SparkContext, TaskContext} import java.io.Closeable - +import java.util.Properties import scala.collection.JavaConverters._ import scala.collection.mutable import scala.util.Try @@ -45,43 +51,40 @@ case class HoodieMergeOnReadPartition(index: Int, split: HoodieMergeOnReadFileSp class HoodieMergeOnReadRDD(@transient sc: SparkContext, @transient config: Configuration, - fullSchemaFileReader: PartitionedFile => Iterator[Any], - requiredSchemaFileReader: PartitionedFile => Iterator[Any], - tableState: HoodieMergeOnReadTableState) + fullSchemaFileReader: PartitionedFile => Iterator[InternalRow], + 
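// Recap of how HoodieFileScanRDD (above) projects rows, on a toy schema with
// hypothetical column names: requiredColumns select and reorder StructFields, and
// requiredFieldPos records where each projected field sits in the full schema.
//
//   schema           = StructType(Seq(uuid: String, fare: Double, ts: Long))
//   requiredColumns  = Array("fare", "uuid")
//   requiredSchema   = StructType(Seq(fare: Double, uuid: String))
//   requiredFieldPos = Seq(1, 0)   // positions of fare and uuid in the full schema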
requiredSchemaFileReader: PartitionedFile => Iterator[InternalRow], + tableState: HoodieMergeOnReadTableState, + tableSchema: HoodieTableSchema, + requiredSchema: HoodieTableSchema) extends RDD[InternalRow](sc, Nil) { private val confBroadcast = sc.broadcast(new SerializableWritable(config)) - private val preCombineField = tableState.preCombineField - private val recordKeyFieldOpt = tableState.recordKeyFieldOpt - private val payloadProps = if (preCombineField.isDefined) { - Some(HoodiePayloadConfig.newBuilder.withPayloadOrderingField(preCombineField.get).build.getProps) - } else { - None - } + private val recordKeyField = tableState.recordKeyField + private val payloadProps = tableState.preCombineFieldOpt + .map(preCombineField => + HoodiePayloadConfig.newBuilder + .withPayloadOrderingField(preCombineField) + .build + .getProps + ) + .getOrElse(new Properties()) + override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = { val mergeOnReadPartition = split.asInstanceOf[HoodieMergeOnReadPartition] val iter = mergeOnReadPartition.split match { - case dataFileOnlySplit if dataFileOnlySplit.logPaths.isEmpty => - read(dataFileOnlySplit.dataFile.get, requiredSchemaFileReader) + case dataFileOnlySplit if dataFileOnlySplit.logFiles.isEmpty => + requiredSchemaFileReader(dataFileOnlySplit.dataFile.get) case logFileOnlySplit if logFileOnlySplit.dataFile.isEmpty => logFileIterator(logFileOnlySplit, getConfig) - case skipMergeSplit if skipMergeSplit.mergeType - .equals(DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL) => - skipMergeFileIterator( - skipMergeSplit, - read(skipMergeSplit.dataFile.get, requiredSchemaFileReader), - getConfig - ) - case payloadCombineSplit if payloadCombineSplit.mergeType - .equals(DataSourceReadOptions.REALTIME_PAYLOAD_COMBINE_OPT_VAL) => - payloadCombineFileIterator( - payloadCombineSplit, - read(payloadCombineSplit.dataFile.get, fullSchemaFileReader), - getConfig - ) + case skipMergeSplit if skipMergeSplit.mergeType.equals(DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL) => + skipMergeFileIterator(skipMergeSplit, requiredSchemaFileReader(skipMergeSplit.dataFile.get), getConfig) + case payloadCombineSplit + if payloadCombineSplit.mergeType.equals(DataSourceReadOptions.REALTIME_PAYLOAD_COMBINE_OPT_VAL) => + payloadCombineFileIterator(payloadCombineSplit, fullSchemaFileReader(payloadCombineSplit.dataFile.get), + getConfig) case _ => throw new HoodieException(s"Unable to select an Iterator to read the Hoodie MOR File Split for " + s"file path: ${mergeOnReadPartition.split.dataFile.get.filePath}" + - s"log paths: ${mergeOnReadPartition.split.logPaths.toString}" + + s"log paths: ${mergeOnReadPartition.split.logFiles.toString}" + s"hoodie table path: ${mergeOnReadPartition.split.tablePath}" + s"spark partition Index: ${mergeOnReadPartition.index}" + s"merge type: ${mergeOnReadPartition.split.mergeType}") @@ -108,43 +111,35 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext, } } - private def read(partitionedFile: PartitionedFile, - readFileFunction: PartitionedFile => Iterator[Any]): Iterator[InternalRow] = { - val fileIterator = readFileFunction(partitionedFile) - val rows = fileIterator.flatMap(_ match { - case r: InternalRow => Seq(r) - case b: ColumnarBatch => b.rowIterator().asScala - }) - rows - } - private def logFileIterator(split: HoodieMergeOnReadFileSplit, config: Configuration): Iterator[InternalRow] = - new Iterator[InternalRow] with Closeable { - private val tableAvroSchema = new Schema.Parser().parse(tableState.tableAvroSchema) 
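// Reader selection in compute() above, summarized per split; the snippet assumes the
// REALTIME_MERGE read option that the two OPT_VAL constants belong to:
//
//   - log files only      -> logFileIterator (nothing to merge against)
//   - base file only      -> plain base-file read with the pruned, required schema
//   - "skip_merge"        -> base rows concatenated with log rows, no key lookup
//   - "payload_combine"   -> base read with the FULL schema, then a per-key payload merge
//
//   spark.read.format("hudi")
//     .option(DataSourceReadOptions.REALTIME_MERGE.key, DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL)
//     .load(basePath)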
- private val requiredAvroSchema = new Schema.Parser().parse(tableState.requiredAvroSchema) + new Iterator[InternalRow] with Closeable with SparkAdapterSupport { + private val tableAvroSchema = new Schema.Parser().parse(tableSchema.avroSchemaStr) + private val requiredAvroSchema = new Schema.Parser().parse(requiredSchema.avroSchemaStr) private val requiredFieldPosition = - tableState.requiredStructSchema + requiredSchema.structTypeSchema .map(f => tableAvroSchema.getField(f.name).pos()).toList private val recordBuilder = new GenericRecordBuilder(requiredAvroSchema) - private val deserializer = HoodieAvroDeserializer(requiredAvroSchema, tableState.requiredStructSchema) - private val unsafeProjection = UnsafeProjection.create(tableState.requiredStructSchema) + private val deserializer = sparkAdapter.createAvroDeserializer(requiredAvroSchema, requiredSchema.structTypeSchema) + private val unsafeProjection = UnsafeProjection.create(requiredSchema.structTypeSchema) private var logScanner = HoodieMergeOnReadRDD.scanLog(split, tableAvroSchema, config) private val logRecords = logScanner.getRecords private val logRecordsKeyIterator = logRecords.keySet().iterator().asScala private var recordToLoad: InternalRow = _ + override def hasNext: Boolean = { if (logRecordsKeyIterator.hasNext) { val curAvrokey = logRecordsKeyIterator.next() - val curAvroRecord = logRecords.get(curAvrokey).getData.getInsertValue(tableAvroSchema) + val curAvroRecord = logRecords.get(curAvrokey).getData.getInsertValue(tableAvroSchema, payloadProps) if (!curAvroRecord.isPresent) { // delete record found, skipping this.hasNext } else { - val requiredAvroRecord = AvroConversionUtils - .buildAvroRecordBySchema(curAvroRecord.get(), requiredAvroSchema, requiredFieldPosition, recordBuilder) - recordToLoad = unsafeProjection(deserializer.deserializeData(requiredAvroRecord).asInstanceOf[InternalRow]) + val requiredAvroRecord = AvroConversionUtils.buildAvroRecordBySchema(curAvroRecord.get(), requiredAvroSchema, + requiredFieldPosition, recordBuilder) + val rowOpt = deserializer.deserialize(requiredAvroRecord) + recordToLoad = unsafeProjection(rowOpt.get.asInstanceOf[InternalRow]) true } } else { @@ -170,15 +165,15 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext, private def skipMergeFileIterator(split: HoodieMergeOnReadFileSplit, baseFileIterator: Iterator[InternalRow], config: Configuration): Iterator[InternalRow] = - new Iterator[InternalRow] with Closeable { - private val tableAvroSchema = new Schema.Parser().parse(tableState.tableAvroSchema) - private val requiredAvroSchema = new Schema.Parser().parse(tableState.requiredAvroSchema) + new Iterator[InternalRow] with Closeable with SparkAdapterSupport { + private val tableAvroSchema = new Schema.Parser().parse(tableSchema.avroSchemaStr) + private val requiredAvroSchema = new Schema.Parser().parse(requiredSchema.avroSchemaStr) private val requiredFieldPosition = - tableState.requiredStructSchema + requiredSchema.structTypeSchema .map(f => tableAvroSchema.getField(f.name).pos()).toList private val recordBuilder = new GenericRecordBuilder(requiredAvroSchema) - private val deserializer = HoodieAvroDeserializer(requiredAvroSchema, tableState.requiredStructSchema) - private val unsafeProjection = UnsafeProjection.create(tableState.requiredStructSchema) + private val deserializer = sparkAdapter.createAvroDeserializer(requiredAvroSchema, requiredSchema.structTypeSchema) + private val unsafeProjection = UnsafeProjection.create(requiredSchema.structTypeSchema) private var logScanner = 
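// Worked example for `requiredFieldPosition` above, with a hypothetical table schema
// (uuid, fare, ts) and required schema (ts, uuid):
//
//   tableAvroSchema.getField("ts").pos()    == 2
//   tableAvroSchema.getField("uuid").pos()  == 0
//   requiredFieldPosition                   == List(2, 0)
//
// buildAvroRecordBySchema then copies values by these positions into the pruned record.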
HoodieMergeOnReadRDD.scanLog(split, tableAvroSchema, config) private val logRecords = logScanner.getRecords private val logRecordsKeyIterator = logRecords.keySet().iterator().asScala @@ -188,19 +183,21 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext, @scala.annotation.tailrec override def hasNext: Boolean = { if (baseFileIterator.hasNext) { - recordToLoad = baseFileIterator.next() + val curRow = baseFileIterator.next() + recordToLoad = unsafeProjection(curRow) true } else { if (logRecordsKeyIterator.hasNext) { val curAvrokey = logRecordsKeyIterator.next() - val curAvroRecord = logRecords.get(curAvrokey).getData.getInsertValue(tableAvroSchema) + val curAvroRecord = logRecords.get(curAvrokey).getData.getInsertValue(tableAvroSchema, payloadProps) if (!curAvroRecord.isPresent) { // delete record found, skipping this.hasNext } else { - val requiredAvroRecord = AvroConversionUtils - .buildAvroRecordBySchema(curAvroRecord.get(), requiredAvroSchema, requiredFieldPosition, recordBuilder) - recordToLoad = unsafeProjection(deserializer.deserializeData(requiredAvroRecord).asInstanceOf[InternalRow]) + val requiredAvroRecord = AvroConversionUtils.buildAvroRecordBySchema(curAvroRecord.get(), requiredAvroSchema, + requiredFieldPosition, recordBuilder) + val rowOpt = deserializer.deserialize(requiredAvroRecord) + recordToLoad = unsafeProjection(rowOpt.get.asInstanceOf[InternalRow]) true } } else { @@ -227,21 +224,22 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext, private def payloadCombineFileIterator(split: HoodieMergeOnReadFileSplit, baseFileIterator: Iterator[InternalRow], config: Configuration): Iterator[InternalRow] = - new Iterator[InternalRow] with Closeable { - private val tableAvroSchema = new Schema.Parser().parse(tableState.tableAvroSchema) - private val requiredAvroSchema = new Schema.Parser().parse(tableState.requiredAvroSchema) + new Iterator[InternalRow] with Closeable with SparkAdapterSupport { + private val tableAvroSchema = new Schema.Parser().parse(tableSchema.avroSchemaStr) + private val requiredAvroSchema = new Schema.Parser().parse(requiredSchema.avroSchemaStr) private val requiredFieldPosition = - tableState.requiredStructSchema + requiredSchema.structTypeSchema .map(f => tableAvroSchema.getField(f.name).pos()).toList - private val serializer = HoodieAvroSerializer(tableState.tableStructSchema, tableAvroSchema, false) - private val requiredDeserializer = HoodieAvroDeserializer(requiredAvroSchema, tableState.requiredStructSchema) + private val serializer = sparkAdapter.createAvroSerializer(tableSchema.structTypeSchema, tableAvroSchema, + resolveAvroSchemaNullability(tableAvroSchema)) + private val requiredDeserializer = sparkAdapter.createAvroDeserializer(requiredAvroSchema, requiredSchema.structTypeSchema) private val recordBuilder = new GenericRecordBuilder(requiredAvroSchema) - private val unsafeProjection = UnsafeProjection.create(tableState.requiredStructSchema) + private val unsafeProjection = UnsafeProjection.create(requiredSchema.structTypeSchema) private var logScanner = HoodieMergeOnReadRDD.scanLog(split, tableAvroSchema, config) private val logRecords = logScanner.getRecords private val logRecordsKeyIterator = logRecords.keySet().iterator().asScala private val keyToSkip = mutable.Set.empty[String] - private val recordKeyPosition = if (recordKeyFieldOpt.isEmpty) HOODIE_RECORD_KEY_COL_POS else tableState.tableStructSchema.fieldIndex(recordKeyFieldOpt.get) + private val recordKeyPosition = tableSchema.structTypeSchema.fieldIndex(recordKeyField) private var 
recordToLoad: InternalRow = _ @@ -259,20 +257,15 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext, this.hasNext } else { // load merged record as InternalRow with required schema - val requiredAvroRecord = AvroConversionUtils - .buildAvroRecordBySchema( - mergedAvroRecord.get(), - requiredAvroSchema, - requiredFieldPosition, - recordBuilder - ) - recordToLoad = unsafeProjection(requiredDeserializer - .deserializeData(requiredAvroRecord).asInstanceOf[InternalRow]) + val requiredAvroRecord = AvroConversionUtils.buildAvroRecordBySchema(mergedAvroRecord.get(), requiredAvroSchema, + requiredFieldPosition, recordBuilder) + val rowOpt = requiredDeserializer.deserialize(requiredAvroRecord) + recordToLoad = unsafeProjection(rowOpt.get.asInstanceOf[InternalRow]) true } } else { // No merge needed, load current row with required schema - recordToLoad = unsafeProjection(createRowWithRequiredSchema(curRow)) + recordToLoad = unsafeProjection(createInternalRowWithSchema(curRow, requiredSchema.structTypeSchema, requiredFieldPosition)) true } } else { @@ -281,8 +274,7 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext, if (keyToSkip.contains(curKey)) { this.hasNext } else { - val insertAvroRecord = - logRecords.get(curKey).getData.getInsertValue(tableAvroSchema) + val insertAvroRecord = logRecords.get(curKey).getData.getInsertValue(tableAvroSchema, payloadProps) if (!insertAvroRecord.isPresent) { // stand alone delete record, skipping this.hasNext @@ -294,8 +286,8 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext, requiredFieldPosition, recordBuilder ) - recordToLoad = unsafeProjection(requiredDeserializer - .deserializeData(requiredAvroRecord).asInstanceOf[InternalRow]) + val rowOpt = requiredDeserializer.deserialize(requiredAvroRecord) + recordToLoad = unsafeProjection(rowOpt.get.asInstanceOf[InternalRow]) true } } @@ -317,29 +309,10 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext, } } - private def createRowWithRequiredSchema(row: InternalRow): InternalRow = { - val rowToReturn = new SpecificInternalRow(tableState.requiredStructSchema) - val posIterator = requiredFieldPosition.iterator - var curIndex = 0 - tableState.requiredStructSchema.foreach( - f => { - val curPos = posIterator.next() - val curField = if (row.isNullAt(curPos)) null else row.get(curPos, f.dataType) - rowToReturn.update(curIndex, curField) - curIndex = curIndex + 1 - } - ) - rowToReturn - } - private def mergeRowWithLog(curRow: InternalRow, curKey: String) = { val historyAvroRecord = serializer.serialize(curRow).asInstanceOf[GenericRecord] - if (payloadProps.isDefined) { - logRecords.get(curKey).getData.combineAndGetUpdateValue(historyAvroRecord, - tableAvroSchema, payloadProps.get) - } else { - logRecords.get(curKey).getData.combineAndGetUpdateValue(historyAvroRecord, tableAvroSchema) - } + logRecords.get(curKey).getData + .combineAndGetUpdateValue(historyAvroRecord, tableAvroSchema, payloadProps) } } } @@ -349,24 +322,60 @@ private object HoodieMergeOnReadRDD { def scanLog(split: HoodieMergeOnReadFileSplit, logSchema: Schema, config: Configuration): HoodieMergedLogRecordScanner = { val fs = FSUtils.getFs(split.tablePath, config) - HoodieMergedLogRecordScanner.newBuilder() - .withFileSystem(fs) - .withBasePath(split.tablePath) - .withLogFilePaths(split.logPaths.get.asJava) - .withReaderSchema(logSchema) - .withLatestInstantTime(split.latestCommit) - .withReadBlocksLazily( - Try(config.get(HoodieRealtimeConfig.COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP, - 
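// The per-key merge contract driving payloadCombineFileIterator above, in isolation:
//
//   val history = serializer.serialize(baseRow).asInstanceOf[GenericRecord]
//   val merged  = logRecord.getData.combineAndGetUpdateValue(history, tableAvroSchema, payloadProps)
//
// An absent `merged` means the log side deletes the base row; `payloadProps` carries the
// precombine (ordering) field so the payload can prefer the newer of the two values.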
HoodieRealtimeConfig.DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED).toBoolean) - .getOrElse(false)) - .withReverseReader(false) - .withBufferSize( - config.getInt(HoodieRealtimeConfig.MAX_DFS_STREAM_BUFFER_SIZE_PROP, - HoodieRealtimeConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE)) - .withMaxMemorySizeInBytes(split.maxCompactionMemoryInBytes) - .withSpillableMapBasePath( - config.get(HoodieRealtimeConfig.SPILLABLE_MAP_BASE_PATH_PROP, - HoodieRealtimeConfig.DEFAULT_SPILLABLE_MAP_BASE_PATH)) - .build() + val logFiles = split.logFiles.get + + if (HoodieTableMetadata.isMetadataTable(split.tablePath)) { + val metadataConfig = HoodieMetadataConfig.newBuilder().enable(true).build() + val dataTableBasePath = getDataTableBasePathFromMetadataTable(split.tablePath) + val metadataTable = new HoodieBackedTableMetadata( + new HoodieLocalEngineContext(config), metadataConfig, + dataTableBasePath, + config.get(HoodieRealtimeConfig.SPILLABLE_MAP_BASE_PATH_PROP, HoodieRealtimeConfig.DEFAULT_SPILLABLE_MAP_BASE_PATH)) + + // NOTE: In case of Metadata Table partition path equates to partition name (since there's just one level + // of indirection among MT partitions) + val relativePartitionPath = getRelativePartitionPath(new Path(split.tablePath), getPartitionPath(split)) + metadataTable.getLogRecordScanner(logFiles.asJava, relativePartitionPath).getLeft + } else { + val logRecordScannerBuilder = HoodieMergedLogRecordScanner.newBuilder() + .withFileSystem(fs) + .withBasePath(split.tablePath) + .withLogFilePaths(split.logFiles.get.map(logFile => getFilePath(logFile.getPath)).asJava) + .withReaderSchema(logSchema) + .withLatestInstantTime(split.latestCommit) + .withReadBlocksLazily( + Try(config.get(HoodieRealtimeConfig.COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP, + HoodieRealtimeConfig.DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED).toBoolean) + .getOrElse(false)) + .withReverseReader(false) + .withBufferSize( + config.getInt(HoodieRealtimeConfig.MAX_DFS_STREAM_BUFFER_SIZE_PROP, + HoodieRealtimeConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE)) + .withMaxMemorySizeInBytes(split.maxCompactionMemoryInBytes) + .withSpillableMapBasePath( + config.get(HoodieRealtimeConfig.SPILLABLE_MAP_BASE_PATH_PROP, + HoodieRealtimeConfig.DEFAULT_SPILLABLE_MAP_BASE_PATH)) + + if (logFiles.nonEmpty) { + logRecordScannerBuilder.withPartition(getRelativePartitionPath(new Path(split.tablePath), logFiles.head.getPath.getParent)) + } + + logRecordScannerBuilder.build() + } + } + + private def getPartitionPath(split: HoodieMergeOnReadFileSplit): Path = { + // Determine partition path as an immediate parent folder of either + // - The base file + // - Some log file + split.dataFile.map(baseFile => new Path(baseFile.filePath)) + .getOrElse(split.logFiles.get.head.getPath) + .getParent + } + + private def resolveAvroSchemaNullability(schema: Schema) = { + AvroConversionUtils.resolveAvroTypeNullability(schema) match { + case (nullable, _) => nullable + } } } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala index 1f2aae4119c55..6b6ddc38e3039 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala @@ -19,6 +19,7 @@ package org.apache.hudi import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord +import 
org.apache.avro.reflect.AvroSchema import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.conf.HiveConf @@ -28,7 +29,7 @@ import org.apache.hudi.avro.HoodieAvroUtils import org.apache.hudi.client.{HoodieWriteResult, SparkRDDWriteClient} import org.apache.hudi.common.config.{HoodieConfig, HoodieMetadataConfig, TypedProperties} import org.apache.hudi.common.fs.FSUtils -import org.apache.hudi.common.model.{HoodieRecordPayload, HoodieTableType, HoodieTimelineTimeZone, WriteOperationType} +import org.apache.hudi.common.model._ import org.apache.hudi.common.table.timeline.HoodieActiveTimeline import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, TableSchemaResolver} import org.apache.hudi.common.util.{CommitUtils, ReflectionUtils, StringUtils} @@ -39,19 +40,21 @@ import org.apache.hudi.execution.bulkinsert.{BulkInsertInternalPartitionerWithRo import org.apache.hudi.hive.{HiveSyncConfig, HiveSyncTool} import org.apache.hudi.index.SparkHoodieIndexFactory import org.apache.hudi.internal.DataSourceInternalWriterHelper +import org.apache.hudi.keygen.{TimestampBasedAvroKeyGenerator, TimestampBasedKeyGenerator} import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory import org.apache.hudi.sync.common.AbstractSyncTool import org.apache.hudi.table.BulkInsertPartitioner import org.apache.log4j.LogManager +import org.apache.spark.SPARK_VERSION import org.apache.spark.api.java.JavaSparkContext +import org.apache.spark.sql.hive.HiveExternalCatalog import org.apache.spark.rdd.RDD import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext, SaveMode, SparkSession} +import org.apache.spark.sql._ import org.apache.spark.SparkContext import java.util.Properties - import scala.collection.JavaConversions._ import scala.collection.mutable import scala.collection.mutable.ListBuffer @@ -85,6 +88,9 @@ object HoodieSparkSqlWriter { validateTableConfig(sqlContext.sparkSession, optParams, tableConfig) val (parameters, hoodieConfig) = mergeParamsAndGetHoodieConfig(optParams, tableConfig) + val originKeyGeneratorClassName = HoodieWriterUtils.getOriginKeyGenerator(parameters) + val timestampKeyGeneratorConfigs = extractConfigsRelatedToTimestmapBasedKeyGenerator( + originKeyGeneratorClassName, parameters) val databaseName = hoodieConfig.getStringOrDefault(HoodieTableConfig.DATABASE_NAME, "") val tblName = hoodieConfig.getStringOrThrow(HoodieWriteConfig.TBL_NAME, s"'${HoodieWriteConfig.TBL_NAME.key}' must be set.").trim @@ -113,6 +119,11 @@ object HoodieSparkSqlWriter { } val jsc = new JavaSparkContext(sparkContext) + if (asyncCompactionTriggerFn.isDefined) { + if (jsc.getConf.getOption(SparkConfigs.SPARK_SCHEDULER_ALLOCATION_FILE_KEY).isDefined) { + jsc.setLocalProperty("spark.scheduler.pool", SparkConfigs.SPARK_DATASOURCE_WRITER_POOL_NAME) + } + } val instantTime = HoodieActiveTimeline.createNewInstantTime() val keyGenerator = HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(hoodieConfig.getProps)) @@ -142,7 +153,8 @@ object HoodieSparkSqlWriter { .setPartitionFields(partitionColumns) .setPopulateMetaFields(populateMetaFields) .setRecordKeyFields(hoodieConfig.getString(RECORDKEY_FIELD)) - .setKeyGeneratorClassProp(HoodieWriterUtils.getOriginKeyGenerator(parameters)) + .setKeyGeneratorClassProp(originKeyGeneratorClassName) + .set(timestampKeyGeneratorConfigs) 
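// Context for the scheduler-pool hook above: async compaction is submitted to its own
// fair-scheduler pool, so the writer pins itself to a dedicated pool to keep ingestion
// from being starved. The local property only takes effect when an allocation file is
// configured (the path below is hypothetical):
//
//   --conf spark.scheduler.allocation.file=/etc/spark/fairscheduler.xml
//   jsc.setLocalProperty("spark.scheduler.pool", SparkConfigs.SPARK_DATASOURCE_WRITER_POOL_NAME)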
.setHiveStylePartitioningEnable(hoodieConfig.getBoolean(HIVE_STYLE_PARTITIONING)) .setUrlEncodePartitioning(hoodieConfig.getBoolean(URL_ENCODE_PARTITIONING)) .setCommitTimezone(HoodieTimelineTimeZone.valueOf(hoodieConfig.getStringOrDefault(HoodieTableConfig.TIMELINE_TIMEZONE))) @@ -226,6 +238,7 @@ object HoodieSparkSqlWriter { if (reconcileSchema) { schema = getLatestTableSchema(fs, basePath, sparkContext, schema) } + validateSchemaForHoodieIsDeleted(schema) sparkContext.getConf.registerAvroSchemas(schema) log.info(s"Registered avro schema : ${schema.toString(true)}") @@ -244,7 +257,8 @@ object HoodieSparkSqlWriter { DataSourceWriteOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.defaultValue()).toBoolean) .asInstanceOf[Comparable[_]] DataSourceUtils.createHoodieRecord(processedRecord, - orderingVal, keyGenerator.getKey(gr), + orderingVal, + keyGenerator.getKey(gr), hoodieConfig.getString(PAYLOAD_CLASS_NAME)) } else { DataSourceUtils.createHoodieRecord(processedRecord, keyGenerator.getKey(gr), hoodieConfig.getString(PAYLOAD_CLASS_NAME)) @@ -366,51 +380,62 @@ object HoodieSparkSqlWriter { schema = HoodieAvroUtils.getNullSchema.toString } - // Handle various save modes if (mode == SaveMode.Ignore && tableExists) { log.warn(s"hoodie table at $basePath already exists. Ignoring & not performing actual writes.") + if (!hoodieWriteClient.isEmpty) { + hoodieWriteClient.get.close() + } false } else { + // Handle various save modes handleSaveModes(sqlContext.sparkSession, mode, basePath, tableConfig, tableName, WriteOperationType.BOOTSTRAP, fs) - } - if (!tableExists) { - val archiveLogFolder = hoodieConfig.getStringOrDefault(HoodieTableConfig.ARCHIVELOG_FOLDER) - val partitionColumns = HoodieWriterUtils.getPartitionColumns(parameters) - val recordKeyFields = hoodieConfig.getString(DataSourceWriteOptions.RECORDKEY_FIELD) - val keyGenProp = hoodieConfig.getString(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME) - val populateMetaFields = parameters.getOrElse(HoodieTableConfig.POPULATE_META_FIELDS.key(), HoodieTableConfig.POPULATE_META_FIELDS.defaultValue()).toBoolean - val baseFileFormat = hoodieConfig.getStringOrDefault(HoodieTableConfig.BASE_FILE_FORMAT) - - HoodieTableMetaClient.withPropertyBuilder() - .setTableType(HoodieTableType.valueOf(tableType)) - .setTableName(tableName) - .setRecordKeyFields(recordKeyFields) - .setArchiveLogFolder(archiveLogFolder) - .setPayloadClassName(hoodieConfig.getStringOrDefault(PAYLOAD_CLASS_NAME)) - .setPreCombineField(hoodieConfig.getStringOrDefault(PRECOMBINE_FIELD, null)) - .setBootstrapIndexClass(bootstrapIndexClass) - .setBaseFileFormat(baseFileFormat) - .setBootstrapBasePath(bootstrapBasePath) - .setPartitionFields(partitionColumns) - .setPopulateMetaFields(populateMetaFields) - .setKeyGeneratorClassProp(keyGenProp) - .setHiveStylePartitioningEnable(hoodieConfig.getBoolean(HIVE_STYLE_PARTITIONING)) - .setUrlEncodePartitioning(hoodieConfig.getBoolean(URL_ENCODE_PARTITIONING)) - .setCommitTimezone(HoodieTimelineTimeZone.valueOf(hoodieConfig.getStringOrDefault(HoodieTableConfig.TIMELINE_TIMEZONE))) - .initTable(sparkContext.hadoopConfiguration, path) + if (!tableExists) { + val archiveLogFolder = hoodieConfig.getStringOrDefault(HoodieTableConfig.ARCHIVELOG_FOLDER) + val partitionColumns = HoodieWriterUtils.getPartitionColumns(parameters) + val recordKeyFields = hoodieConfig.getString(DataSourceWriteOptions.RECORDKEY_FIELD) + val keyGenProp = hoodieConfig.getString(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME) + val populateMetaFields = 
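// What validateSchemaForHoodieIsDeleted (invoked above) accepts and rejects, expressed
// as Avro field declarations; a nullable boolean passes because nullability is resolved
// before the type check:
//
//   {"name": "_hoodie_is_deleted", "type": ["null", "boolean"], "default": null}  // ok
//   {"name": "_hoodie_is_deleted", "type": "boolean"}                             // ok
//   {"name": "_hoodie_is_deleted", "type": "string"}           // throws HoodieException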
parameters.getOrElse(HoodieTableConfig.POPULATE_META_FIELDS.key(), HoodieTableConfig.POPULATE_META_FIELDS.defaultValue()).toBoolean + val baseFileFormat = hoodieConfig.getStringOrDefault(HoodieTableConfig.BASE_FILE_FORMAT) + + HoodieTableMetaClient.withPropertyBuilder() + .setTableType(HoodieTableType.valueOf(tableType)) + .setTableName(tableName) + .setRecordKeyFields(recordKeyFields) + .setArchiveLogFolder(archiveLogFolder) + .setPayloadClassName(hoodieConfig.getStringOrDefault(PAYLOAD_CLASS_NAME)) + .setPreCombineField(hoodieConfig.getStringOrDefault(PRECOMBINE_FIELD, null)) + .setBootstrapIndexClass(bootstrapIndexClass) + .setBaseFileFormat(baseFileFormat) + .setBootstrapBasePath(bootstrapBasePath) + .setPartitionFields(partitionColumns) + .setPopulateMetaFields(populateMetaFields) + .setKeyGeneratorClassProp(keyGenProp) + .setHiveStylePartitioningEnable(hoodieConfig.getBoolean(HIVE_STYLE_PARTITIONING)) + .setUrlEncodePartitioning(hoodieConfig.getBoolean(URL_ENCODE_PARTITIONING)) + .setCommitTimezone(HoodieTimelineTimeZone.valueOf(hoodieConfig.getStringOrDefault(HoodieTableConfig.TIMELINE_TIMEZONE))) + .initTable(sparkContext.hadoopConfiguration, path) + } + + val jsc = new JavaSparkContext(sqlContext.sparkContext) + val writeClient = hoodieWriteClient.getOrElse(DataSourceUtils.createHoodieClient(jsc, + schema, path, tableName, mapAsJavaMap(parameters))) + try { + writeClient.bootstrap(org.apache.hudi.common.util.Option.empty()) + } finally { + writeClient.close() + } + val metaSyncSuccess = metaSync(sqlContext.sparkSession, hoodieConfig, basePath, df.schema) + metaSyncSuccess } + } - val jsc = new JavaSparkContext(sqlContext.sparkContext) - val writeClient = hoodieWriteClient.getOrElse(DataSourceUtils.createHoodieClient(jsc, - schema, path, tableName, mapAsJavaMap(parameters))) - try { - writeClient.bootstrap(org.apache.hudi.common.util.Option.empty()) - } finally { - writeClient.close() + def validateSchemaForHoodieIsDeleted(schema: Schema): Unit = { + if (schema.getField(HoodieRecord.HOODIE_IS_DELETED) != null && + AvroConversionUtils.resolveAvroTypeNullability(schema.getField(HoodieRecord.HOODIE_IS_DELETED).schema())._2.getType != Schema.Type.BOOLEAN) { + throw new HoodieException(HoodieRecord.HOODIE_IS_DELETED + " has to be BOOLEAN type. 
Passed in dataframe's schema has type " + + schema.getField(HoodieRecord.HOODIE_IS_DELETED).schema().getType) } - val metaSyncSuccess = metaSync(sqlContext.sparkSession, hoodieConfig, basePath, df.schema) - metaSyncSuccess } def bulkInsertAsRow(sqlContext: SQLContext, @@ -435,6 +460,7 @@ object HoodieSparkSqlWriter { if (dropPartitionColumns) { schema = generateSchemaWithoutPartitionColumns(partitionColumns, schema) } + validateSchemaForHoodieIsDeleted(schema) sparkContext.getConf.registerAvroSchemas(schema) log.info(s"Registered avro schema : ${schema.toString(true)}") if (parameters(INSERT_DROP_DUPS.key).toBoolean) { @@ -535,6 +561,9 @@ object HoodieSparkSqlWriter { val hiveSyncConfig: HiveSyncConfig = buildSyncConfig(basePath, hoodieConfig, sqlConf) val hiveConf: HiveConf = new HiveConf() hiveConf.addResource(fs.getConf) + if (StringUtils.isNullOrEmpty(hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname))) { + hiveConf.set(HiveConf.ConfVars.METASTOREURIS.varname, hiveSyncConfig.metastoreUris) + } new HiveSyncTool(hiveSyncConfig, hiveConf, fs).syncHoodieTable() true } @@ -550,6 +579,7 @@ object HoodieSparkSqlWriter { hiveSyncConfig.hiveUser = hoodieConfig.getString(HIVE_USER) hiveSyncConfig.hivePass = hoodieConfig.getString(HIVE_PASS) hiveSyncConfig.jdbcUrl = hoodieConfig.getString(HIVE_URL) + hiveSyncConfig.metastoreUris = hoodieConfig.getStringOrDefault(METASTORE_URIS) hiveSyncConfig.skipROSuffix = hoodieConfig.getStringOrDefault(HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE, DataSourceWriteOptions.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE.defaultValue).toBoolean hiveSyncConfig.partitionFields = @@ -569,6 +599,7 @@ object HoodieSparkSqlWriter { hiveSyncConfig.syncMode = hoodieConfig.getString(HIVE_SYNC_MODE) hiveSyncConfig.serdeProperties = hoodieConfig.getString(HIVE_TABLE_SERDE_PROPERTIES) hiveSyncConfig.tableProperties = hoodieConfig.getString(HIVE_TABLE_PROPERTIES) + hiveSyncConfig.sparkVersion = SPARK_VERSION hiveSyncConfig } @@ -730,7 +761,22 @@ object HoodieSparkSqlWriter { mergedParams(key) = value } } + + // use preCombineField to fill in PAYLOAD_ORDERING_FIELD_PROP_KEY + if (mergedParams.contains(PRECOMBINE_FIELD.key())) { + mergedParams.put(HoodiePayloadProps.PAYLOAD_ORDERING_FIELD_PROP_KEY, mergedParams(PRECOMBINE_FIELD.key())) + } val params = mergedParams.toMap (params, HoodieWriterUtils.convertMapToHoodieConfig(params)) } + + private def extractConfigsRelatedToTimestmapBasedKeyGenerator(keyGenerator: String, + params: Map[String, String]): Map[String, String] = { + if (keyGenerator.equals(classOf[TimestampBasedKeyGenerator].getCanonicalName) || + keyGenerator.equals(classOf[TimestampBasedAvroKeyGenerator].getCanonicalName)) { + params.filterKeys(HoodieTableConfig.PERSISTED_CONFIG_LIST.contains) + } else { + Map.empty + } + } } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala index 9a940ebcebf02..8a4ad9d85d72d 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieWriterUtils.scala @@ -72,6 +72,7 @@ object HoodieWriterUtils { hoodieConfig.setDefaultValue(HIVE_USER) hoodieConfig.setDefaultValue(HIVE_PASS) hoodieConfig.setDefaultValue(HIVE_URL) + hoodieConfig.setDefaultValue(METASTORE_URIS) hoodieConfig.setDefaultValue(HIVE_PARTITION_FIELDS) 
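// Effect of the METASTORE_URIS default registered above (the URI is hypothetical): when
// hive-site.xml leaves hive.metastore.uris unset, HiveSyncTool now falls back to the
// datasource option instead of failing to connect, e.g.
//
//   df.write.format("hudi")
//     .option(DataSourceWriteOptions.METASTORE_URIS.key, "thrift://hive-metastore:9083")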
hoodieConfig.setDefaultValue(HIVE_PARTITION_EXTRACTOR_CLASS) hoodieConfig.setDefaultValue(HIVE_STYLE_PARTITIONING) diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/IncrementalRelation.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/IncrementalRelation.scala index 19071080312bc..9247973e78fc0 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/IncrementalRelation.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/IncrementalRelation.scala @@ -18,16 +18,17 @@ package org.apache.hudi import org.apache.avro.Schema +import org.apache.hudi.common.model.{HoodieCommitMetadata, HoodieRecord, HoodieReplaceCommitMetadata} +import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} import java.util.stream.Collectors -import org.apache.hudi.common.model.{HoodieCommitMetadata, HoodieRecord, HoodieReplaceCommitMetadata, HoodieTableType} -import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} +import org.apache.hadoop.fs.{GlobPattern, Path} +import org.apache.hudi.client.common.HoodieSparkEngineContext +import org.apache.hudi.common.fs.FSUtils import org.apache.hudi.common.table.timeline.{HoodieInstant, HoodieTimeline} +import org.apache.hudi.common.util.HoodieTimer import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.exception.HoodieException -import org.apache.hadoop.fs.GlobPattern -import org.apache.hudi.client.common.HoodieSparkEngineContext -import org.apache.hudi.common.fs.FSUtils import org.apache.hudi.table.HoodieSparkTable import org.apache.log4j.LogManager import org.apache.spark.api.java.JavaSparkContext @@ -40,14 +41,14 @@ import scala.collection.JavaConversions._ import scala.collection.mutable /** - * Relation, that implements the Hoodie incremental view. - * - * Implemented for Copy_on_write storage. - * - */ + * Relation, that implements the Hoodie incremental view. + * + * Implemented for Copy_on_write storage. 
+ * + */ class IncrementalRelation(val sqlContext: SQLContext, val optParams: Map[String, String], - val userSchema: StructType, + val userSchema: Option[StructType], val metaClient: HoodieTableMetaClient) extends BaseRelation with TableScan { private val log = LogManager.getLogger(classOf[IncrementalRelation]) @@ -85,7 +86,7 @@ class IncrementalRelation(val sqlContext: SQLContext, log.info("Inferring schema..") val schemaResolver = new TableSchemaResolver(metaClient) val tableSchema = if (useEndInstantSchema) { - if (commitsToReturn.isEmpty) schemaResolver.getTableAvroSchemaWithoutMetadataFields() else + if (commitsToReturn.isEmpty) schemaResolver.getTableAvroSchemaWithoutMetadataFields() else schemaResolver.getTableAvroSchemaWithoutMetadataFields(commitsToReturn.last) } else { schemaResolver.getTableAvroSchemaWithoutMetadataFields() @@ -165,26 +166,63 @@ class IncrementalRelation(val sqlContext: SQLContext, if (filteredRegularFullPaths.isEmpty && filteredMetaBootstrapFullPaths.isEmpty) { sqlContext.sparkContext.emptyRDD[Row] } else { - log.info("Additional Filters to be applied to incremental source are :" + filters) + log.info("Additional Filters to be applied to incremental source are :" + filters.mkString("Array(", ", ", ")")) var df: DataFrame = sqlContext.createDataFrame(sqlContext.sparkContext.emptyRDD[Row], usedSchema) - if (metaBootstrapFileIdToFullPath.nonEmpty) { - df = sqlContext.sparkSession.read - .format("hudi") - .schema(usedSchema) - .option(DataSourceReadOptions.READ_PATHS.key, filteredMetaBootstrapFullPaths.mkString(",")) - .load() + val fallbackToFullTableScan = optParams.getOrElse(DataSourceReadOptions.INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN_FOR_NON_EXISTING_FILES.key, + DataSourceReadOptions.INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN_FOR_NON_EXISTING_FILES.defaultValue).toBoolean + + var doFullTableScan = false + + if (fallbackToFullTableScan) { + val fs = new Path(basePath).getFileSystem(sqlContext.sparkContext.hadoopConfiguration); + val timer = new HoodieTimer().startTimer(); + + val allFilesToCheck = filteredMetaBootstrapFullPaths ++ filteredRegularFullPaths + val firstNotFoundPath = allFilesToCheck.find(path => !fs.exists(new Path(path))) + val timeTaken = timer.endTimer() + log.info("Checking if paths exists took " + timeTaken + "ms") + + val optStartTs = optParams(DataSourceReadOptions.BEGIN_INSTANTTIME.key) + val isInstantArchived = optStartTs.compareTo(commitTimeline.firstInstant().get().getTimestamp) < 0 // True if optStartTs < activeTimeline.first + + if (isInstantArchived || firstNotFoundPath.isDefined) { + doFullTableScan = true + log.info("Falling back to full table scan") + } } - if (regularFileIdToFullPath.nonEmpty) { - df = df.union(sqlContext.read.options(sOpts) + if (doFullTableScan) { + val hudiDF = sqlContext.read + .format("hudi_v1") .schema(usedSchema) - .parquet(filteredRegularFullPaths.toList: _*) - .filter(String.format("%s >= '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, - commitsToReturn.head.getTimestamp)) + .load(basePath) + .filter(String.format("%s > '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, //Notice the > in place of >= because we are working with optParam instead of first commit > optParam + optParams(DataSourceReadOptions.BEGIN_INSTANTTIME.key))) .filter(String.format("%s <= '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, - commitsToReturn.last.getTimestamp))) + commitsToReturn.last.getTimestamp)) + // schema enforcement does not happen in above spark.read with hudi. 
hence selecting explicitly w/ right column order + val fieldNames : Array[String] = df.schema.fields.map(field => field.name) + df = df.union(hudiDF.select(fieldNames.head, fieldNames.tail: _*)) + } else { + if (metaBootstrapFileIdToFullPath.nonEmpty) { + df = sqlContext.sparkSession.read + .format("hudi_v1") + .schema(usedSchema) + .option(DataSourceReadOptions.READ_PATHS.key, filteredMetaBootstrapFullPaths.mkString(",")) + .load() + } + + if (regularFileIdToFullPath.nonEmpty) { + df = df.union(sqlContext.read.options(sOpts) + .schema(usedSchema) + .parquet(filteredRegularFullPaths.toList: _*) + .filter(String.format("%s >= '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, + commitsToReturn.head.getTimestamp)) + .filter(String.format("%s <= '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, + commitsToReturn.last.getTimestamp))) + } } filters.foldLeft(df)((e, f) => e.filter(f)).rdd diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadIncrementalRelation.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadIncrementalRelation.scala index bc83a85415de2..b9d18c68d3d60 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadIncrementalRelation.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadIncrementalRelation.scala @@ -17,21 +17,19 @@ package org.apache.hudi +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{GlobPattern, Path} +import org.apache.hadoop.mapred.JobConf +import org.apache.hudi.HoodieBaseRelation.createBaseFileReader import org.apache.hudi.common.model.HoodieRecord +import org.apache.hudi.common.table.HoodieTableMetaClient import org.apache.hudi.common.table.view.HoodieTableFileSystemView -import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} import org.apache.hudi.exception.HoodieException -import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.listAffectedFilesForCommits -import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.getCommitMetadata -import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.getWritePartitionPaths +import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.{getCommitMetadata, getWritePartitionPaths, listAffectedFilesForCommits} import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils.getMaxCompactionMemoryInBytes -import org.apache.hadoop.fs.{GlobPattern, Path} -import org.apache.hadoop.mapred.JobConf -import org.apache.log4j.LogManager import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.PartitionedFile -import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{Row, SQLContext} @@ -39,19 +37,19 @@ import org.apache.spark.sql.{Row, SQLContext} import scala.collection.JavaConversions._ /** - * Experimental. - * Relation, that implements the Hoodie incremental view for Merge On Read table. - * - */ -class MergeOnReadIncrementalRelation(val sqlContext: SQLContext, + * Experimental. + * Relation, that implements the Hoodie incremental view for Merge On Read table. 
+ * + */ +class MergeOnReadIncrementalRelation(sqlContext: SQLContext, val optParams: Map[String, String], - val userSchema: StructType, + val userSchema: Option[StructType], val metaClient: HoodieTableMetaClient) - extends BaseRelation with PrunedFilteredScan { + extends HoodieBaseRelation(sqlContext, metaClient, optParams, userSchema) { - private val log = LogManager.getLogger(classOf[MergeOnReadIncrementalRelation]) - private val conf = sqlContext.sparkContext.hadoopConfiguration + private val conf = new Configuration(sqlContext.sparkContext.hadoopConfiguration) private val jobConf = new JobConf(conf) + private val commitTimeline = metaClient.getCommitsAndCompactionTimeline.filterCompletedInstants() if (commitTimeline.empty()) { throw new HoodieException("No instants to incrementally pull") } @@ -72,91 +70,96 @@ class MergeOnReadIncrementalRelation(val sqlContext: SQLContext, private val commitsTimelineToReturn = commitTimeline.findInstantsInRange( optParams(DataSourceReadOptions.BEGIN_INSTANTTIME.key), optParams.getOrElse(DataSourceReadOptions.END_INSTANTTIME.key, lastInstant.getTimestamp)) - log.debug(s"${commitsTimelineToReturn.getInstants.iterator().toList.map(f => f.toString).mkString(",")}") + logDebug(s"${commitsTimelineToReturn.getInstants.iterator().toList.map(f => f.toString).mkString(",")}") private val commitsToReturn = commitsTimelineToReturn.getInstants.iterator().toList - private val schemaUtil = new TableSchemaResolver(metaClient) - private val tableAvroSchema = schemaUtil.getTableAvroSchema - private val tableStructSchema = AvroConversionUtils.convertAvroSchemaToStructType(tableAvroSchema) + private val maxCompactionMemoryInBytes = getMaxCompactionMemoryInBytes(jobConf) + private val fileIndex = if (commitsToReturn.isEmpty) List() else buildFileIndex() - private val preCombineField = { - val preCombineFieldFromTableConfig = metaClient.getTableConfig.getPreCombineField - if (preCombineFieldFromTableConfig != null) { - Some(preCombineFieldFromTableConfig) - } else { - // get preCombineFiled from the options if this is a old table which have not store - // the field to hoodie.properties - optParams.get(DataSourceReadOptions.READ_PRE_COMBINE_FIELD.key) - } - } - override def schema: StructType = tableStructSchema - override def needConversion: Boolean = false + private val preCombineFieldOpt = getPrecombineFieldProperty - override def unhandledFilters(filters: Array[Filter]): Array[Filter] = { - if (fileIndex.isEmpty) { - filters - } else { - val isNotNullFilter = IsNotNull(HoodieRecord.COMMIT_TIME_METADATA_FIELD) - val largerThanFilter = GreaterThanOrEqual(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.head.getTimestamp) - val lessThanFilter = LessThanOrEqual(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.last.getTimestamp) - filters :+ isNotNullFilter :+ largerThanFilter :+ lessThanFilter - } + // Record filters making sure that only records w/in the requested bounds are being fetched as part of the + // scan collected by this relation + private lazy val incrementalSpanRecordsFilters: Seq[Filter] = { + val isNotNullFilter = IsNotNull(HoodieRecord.COMMIT_TIME_METADATA_FIELD) + val largerThanFilter = GreaterThanOrEqual(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.head.getTimestamp) + val lessThanFilter = LessThanOrEqual(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.last.getTimestamp) + Seq(isNotNullFilter, largerThanFilter, lessThanFilter) + } + + private lazy val mandatoryColumns = { + // NOTE: These columns are required for
Incremental flow to be able to handle the rows properly, even in + // cases when no columns are requested to be fetched (for ex, when using {@code count()} API) + Seq(HoodieRecord.RECORD_KEY_METADATA_FIELD, HoodieRecord.COMMIT_TIME_METADATA_FIELD) ++ + preCombineFieldOpt.map(Seq(_)).getOrElse(Seq()) } + override def needConversion: Boolean = false + override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { if (fileIndex.isEmpty) { sqlContext.sparkContext.emptyRDD[Row] } else { - log.debug(s"buildScan requiredColumns = ${requiredColumns.mkString(",")}") - log.debug(s"buildScan filters = ${filters.mkString(",")}") + logDebug(s"buildScan requiredColumns = ${requiredColumns.mkString(",")}") + logDebug(s"buildScan filters = ${filters.mkString(",")}") + // config to ensure the push down filter for parquet will be applied. sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.filterPushdown", "true") sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.recordLevelFilter.enabled", "true") sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.enableVectorizedReader", "false") - val pushDownFilter = { - val isNotNullFilter = IsNotNull(HoodieRecord.COMMIT_TIME_METADATA_FIELD) - val largerThanFilter = GreaterThanOrEqual(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.head.getTimestamp) - val lessThanFilter = LessThanOrEqual(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.last.getTimestamp) - filters :+ isNotNullFilter :+ largerThanFilter :+ lessThanFilter - } + + val fetchedColumns: Array[String] = appendMandatoryColumns(requiredColumns) + val (requiredAvroSchema, requiredStructSchema) = - MergeOnReadSnapshotRelation.getRequiredSchema(tableAvroSchema, requiredColumns) - - val hoodieTableState = HoodieMergeOnReadTableState( - tableStructSchema, - requiredStructSchema, - tableAvroSchema.toString, - requiredAvroSchema.toString, - fileIndex, - preCombineField, - Option.empty - ) - val fullSchemaParquetReader = new ParquetFileFormat().buildReaderWithPartitionValues( - sparkSession = sqlContext.sparkSession, - dataSchema = tableStructSchema, - partitionSchema = StructType(Nil), - requiredSchema = tableStructSchema, - filters = pushDownFilter, + HoodieSparkUtils.getRequiredSchema(tableAvroSchema, fetchedColumns) + + val partitionSchema = StructType(Nil) + val tableSchema = HoodieTableSchema(tableStructSchema, tableAvroSchema.toString) + val requiredSchema = HoodieTableSchema(requiredStructSchema, requiredAvroSchema.toString) + + val fullSchemaParquetReader = createBaseFileReader( + spark = sqlContext.sparkSession, + partitionSchema = partitionSchema, + tableSchema = tableSchema, + requiredSchema = tableSchema, + // This file-reader is used to read base file records, subsequently merging them with the records + // stored in delta-log files. 
As such, we have to read _all_ records from the base file, while avoiding + // applying any user-defined filtering _before_ we complete combining them w/ delta-log records (to make sure that + // we combine them correctly) + // + // The only filtering applicable here is the filtering to make sure we're only fetching records that + // fall into incremental span of the timeline being queried + filters = incrementalSpanRecordsFilters, options = optParams, - hadoopConf = sqlContext.sparkSession.sessionState.newHadoopConf() + // NOTE: We have to fork the Hadoop Config here as Spark will be modifying it + // to configure Parquet reader appropriately + hadoopConf = new Configuration(conf) ) - val requiredSchemaParquetReader = new ParquetFileFormat().buildReaderWithPartitionValues( - sparkSession = sqlContext.sparkSession, - dataSchema = tableStructSchema, - partitionSchema = StructType(Nil), - requiredSchema = requiredStructSchema, - filters = pushDownFilter, + val requiredSchemaParquetReader = createBaseFileReader( + spark = sqlContext.sparkSession, + partitionSchema = partitionSchema, + tableSchema = tableSchema, + requiredSchema = requiredSchema, + filters = filters ++ incrementalSpanRecordsFilters, options = optParams, - hadoopConf = sqlContext.sparkSession.sessionState.newHadoopConf() + // NOTE: We have to fork the Hadoop Config here as Spark will be modifying it + // to configure Parquet reader appropriately + hadoopConf = new Configuration(conf) ) + val hoodieTableState = HoodieMergeOnReadTableState(fileIndex, HoodieRecord.RECORD_KEY_METADATA_FIELD, preCombineFieldOpt) + + // TODO implement incremental span record filtering w/in RDD to make sure returned iterator is appropriately + // filtered, since file-reader might not be capable to perform filtering val rdd = new HoodieMergeOnReadRDD( sqlContext.sparkContext, jobConf, fullSchemaParquetReader, requiredSchemaParquetReader, - hoodieTableState + hoodieTableState, + tableSchema, + requiredSchema ) rdd.asInstanceOf[RDD[Row]] } @@ -164,7 +167,7 @@ class MergeOnReadIncrementalRelation(val sqlContext: SQLContext, def buildFileIndex(): List[HoodieMergeOnReadFileSplit] = { val metadataList = commitsToReturn.map(instant => getCommitMetadata(instant, commitsTimelineToReturn)) - val affectedFileStatus = listAffectedFilesForCommits(new Path(metaClient.getBasePath), metadataList) + val affectedFileStatus = listAffectedFilesForCommits(conf, new Path(metaClient.getBasePath), metadataList) val fsView = new HoodieTableFileSystemView(metaClient, commitsTimelineToReturn, affectedFileStatus) // Iterate partitions to create splits @@ -173,7 +176,7 @@ class MergeOnReadIncrementalRelation(val sqlContext: SQLContext, ).toList val latestCommit = fsView.getLastInstant.get.getTimestamp if (log.isDebugEnabled) { - fileGroups.foreach(f => log.debug(s"current file group id: " + + fileGroups.foreach(f => logDebug(s"current file group id: " + s"${f.getFileGroupId} and file slices ${f.getLatestFileSlice.get.toString}")) } @@ -210,10 +213,9 @@ class MergeOnReadIncrementalRelation(val sqlContext: SQLContext, } val logPath = if (f.getLatestFileSlice.isPresent) { - //If log path doesn't exist, we still include an empty path to avoid using + // If log path doesn't exist, we still include an empty path to avoid using // the default parquet reader to ensure the push down filter will be applied. 
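The fallback-to-full-table-scan behavior added to IncrementalRelation above is driven by a read option. A hedged usage sketch follows (table path and begin instant are placeholders; the option strings are meant to mirror DataSourceReadOptions, including the new INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN_FOR_NON_EXISTING_FILES key, and should be verified against that class):

import org.apache.spark.sql.SparkSession

object IncrementalReadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("hudi-incremental-sketch").getOrCreate()
    val basePath = "/tmp/hudi_table"   // placeholder table path
    val beginTime = "20220101000000"   // placeholder begin commit instant

    val incrementalDF = spark.read.format("hudi")
      .option("hoodie.datasource.query.type", "incremental")
      .option("hoodie.datasource.read.begin.instanttime", beginTime)
      // When the begin instant is already archived, or some touched files no longer exist,
      // fall back to a snapshot scan filtered on _hoodie_commit_time instead of failing
      .option("hoodie.datasource.read.incr.fallback.fulltablescan.enable", "true")
      .load(basePath)

    incrementalDF.show(false)
  }
}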
- Option(f.getLatestFileSlice.get().getLogFiles.iterator().toList - .map(logfile => logfile.getPath.toString)) + Option(f.getLatestFileSlice.get().getLogFiles.iterator().toList) } else { Option.empty @@ -223,4 +225,9 @@ class MergeOnReadIncrementalRelation(val sqlContext: SQLContext, latestCommit, metaClient.getBasePath, maxCompactionMemoryInBytes, mergeType) }) } + + private def appendMandatoryColumns(requestedColumns: Array[String]): Array[String] = { + val missing = mandatoryColumns.filter(col => !requestedColumns.contains(col)) + requestedColumns ++ missing + } } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadSnapshotRelation.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadSnapshotRelation.scala index c4d670bb62f8a..7c1a3540c814e 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadSnapshotRelation.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadSnapshotRelation.scala @@ -18,84 +18,70 @@ package org.apache.hudi -import org.apache.avro.Schema -import org.apache.hudi.common.model.HoodieLogFile -import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hadoop.mapred.JobConf +import org.apache.hudi.HoodieBaseRelation.{createBaseFileReader, isMetadataTable} +import org.apache.hudi.common.model.{HoodieLogFile, HoodieRecord} +import org.apache.hudi.common.table.HoodieTableMetaClient import org.apache.hudi.common.table.view.HoodieTableFileSystemView import org.apache.hudi.hadoop.utils.HoodieRealtimeInputFormatUtils import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils.getMaxCompactionMemoryInBytes -import org.apache.hadoop.fs.Path -import org.apache.hadoop.mapred.JobConf -import org.apache.spark.internal.Logging +import org.apache.hudi.metadata.HoodieMetadataPayload import org.apache.spark.rdd.RDD -import org.apache.spark.sql.avro.SchemaConverters import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.execution.datasources.{FileStatusCache, PartitionedFile} -import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat import org.apache.spark.sql.hudi.HoodieSqlCommonUtils -import org.apache.spark.sql.{Row, SQLContext} -import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan} +import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{Row, SQLContext} import scala.collection.JavaConverters._ case class HoodieMergeOnReadFileSplit(dataFile: Option[PartitionedFile], - logPaths: Option[List[String]], + logFiles: Option[List[HoodieLogFile]], latestCommit: String, tablePath: String, maxCompactionMemoryInBytes: Long, mergeType: String) -case class HoodieMergeOnReadTableState(tableStructSchema: StructType, - requiredStructSchema: StructType, - tableAvroSchema: String, - requiredAvroSchema: String, - hoodieRealtimeFileSplits: List[HoodieMergeOnReadFileSplit], - preCombineField: Option[String], - recordKeyFieldOpt: Option[String]) - -class MergeOnReadSnapshotRelation(val sqlContext: SQLContext, - val optParams: Map[String, String], - val userSchema: StructType, - val globPaths: Option[Seq[Path]], +case class HoodieMergeOnReadTableState(hoodieRealtimeFileSplits: List[HoodieMergeOnReadFileSplit], + recordKeyField: String, + 
preCombineFieldOpt: Option[String]) + +class MergeOnReadSnapshotRelation(sqlContext: SQLContext, + optParams: Map[String, String], + val userSchema: Option[StructType], + val globPaths: Seq[Path], val metaClient: HoodieTableMetaClient) - extends BaseRelation with PrunedFilteredScan with Logging { + extends HoodieBaseRelation(sqlContext, metaClient, optParams, userSchema) { - private val conf = sqlContext.sparkContext.hadoopConfiguration + private val conf = new Configuration(sqlContext.sparkContext.hadoopConfiguration) private val jobConf = new JobConf(conf) - // use schema from latest metadata, if not present, read schema from the data file - private val schemaUtil = new TableSchemaResolver(metaClient) - private lazy val tableAvroSchema = { - try { - schemaUtil.getTableAvroSchema - } catch { - case _: Throwable => // If there is no commit in the table, we cann't get the schema - // with schemaUtil, use the userSchema instead. - SchemaConverters.toAvroType(userSchema) - } - } - private lazy val tableStructSchema = AvroConversionUtils.convertAvroSchemaToStructType(tableAvroSchema) private val mergeType = optParams.getOrElse( DataSourceReadOptions.REALTIME_MERGE.key, DataSourceReadOptions.REALTIME_MERGE.defaultValue) + private val maxCompactionMemoryInBytes = getMaxCompactionMemoryInBytes(jobConf) - private val preCombineField = { - val preCombineFieldFromTableConfig = metaClient.getTableConfig.getPreCombineField - if (preCombineFieldFromTableConfig != null) { - Some(preCombineFieldFromTableConfig) + + // If meta fields are enabled, always prefer key from the meta field as opposed to user-specified one + // NOTE: This is historical behavior which is preserved as is + private val recordKeyField = { + if (metaClient.getTableConfig.populateMetaFields()) HoodieRecord.RECORD_KEY_METADATA_FIELD + else metaClient.getTableConfig.getRecordKeyFieldProp + } + + private val preCombineFieldOpt = getPrecombineFieldProperty + + private lazy val mandatoryColumns = { + if (isMetadataTable(metaClient)) { + Seq(HoodieMetadataPayload.KEY_FIELD_NAME, HoodieMetadataPayload.SCHEMA_FIELD_NAME_TYPE) } else { - // get preCombineFiled from the options if this is a old table which have not store - // the field to hoodie.properties - optParams.get(DataSourceReadOptions.READ_PRE_COMBINE_FIELD.key) + Seq(recordKeyField) ++ preCombineFieldOpt.map(Seq(_)).getOrElse(Seq()) } } - private var recordKeyFieldOpt = Option.empty[String] - if (!metaClient.getTableConfig.populateMetaFields()) { - recordKeyFieldOpt = Option(metaClient.getTableConfig.getRecordKeyFieldProp) - } - override def schema: StructType = tableStructSchema override def needConversion: Boolean = false @@ -106,51 +92,63 @@ class MergeOnReadSnapshotRelation(val sqlContext: SQLContext, log.debug(s" buildScan requiredColumns = ${requiredColumns.mkString(",")}") log.debug(s" buildScan filters = ${filters.mkString(",")}") + // NOTE: In case list of requested columns doesn't contain the Primary Key one, we + // have to add it explicitly so that + // - Merging could be performed correctly + // - In case 0 columns are to be fetched (for ex, when doing {@code count()} on Spark's [[Dataset]], + // Spark still fetches all the rows to execute the query correctly + // + // It's okay to return columns that have not been requested by the caller, as those nevertheless will be + // filtered out upstream + val fetchedColumns: Array[String] = appendMandatoryColumns(requiredColumns) + val (requiredAvroSchema, requiredStructSchema) = - 
MergeOnReadSnapshotRelation.getRequiredSchema(tableAvroSchema, requiredColumns) + HoodieSparkUtils.getRequiredSchema(tableAvroSchema, fetchedColumns) val fileIndex = buildFileIndex(filters) - val hoodieTableState = HoodieMergeOnReadTableState( - tableStructSchema, - requiredStructSchema, - tableAvroSchema.toString, - requiredAvroSchema.toString, - fileIndex, - preCombineField, - recordKeyFieldOpt - ) - val fullSchemaParquetReader = new ParquetFileFormat().buildReaderWithPartitionValues( - sparkSession = sqlContext.sparkSession, - dataSchema = tableStructSchema, - partitionSchema = StructType(Nil), - requiredSchema = tableStructSchema, + + val partitionSchema = StructType(Nil) + val tableSchema = HoodieTableSchema(tableStructSchema, tableAvroSchema.toString) + val requiredSchema = HoodieTableSchema(requiredStructSchema, requiredAvroSchema.toString) + + val fullSchemaParquetReader = createBaseFileReader( + spark = sqlContext.sparkSession, + partitionSchema = partitionSchema, + tableSchema = tableSchema, + requiredSchema = tableSchema, + // This file-reader is used to read base file records, subsequently merging them with the records + // stored in delta-log files. As such, we have to read _all_ records from the base file, while avoiding + // applying any filtering _before_ we complete combining them w/ delta-log records (to make sure that + // we combine them correctly) filters = Seq.empty, options = optParams, - hadoopConf = sqlContext.sparkSession.sessionState.newHadoopConf() + // NOTE: We have to fork the Hadoop Config here as Spark will be modifying it + // to configure Parquet reader appropriately + hadoopConf = new Configuration(conf) ) - val requiredSchemaParquetReader = new ParquetFileFormat().buildReaderWithPartitionValues( - sparkSession = sqlContext.sparkSession, - dataSchema = tableStructSchema, - partitionSchema = StructType(Nil), - requiredSchema = requiredStructSchema, + val requiredSchemaParquetReader = createBaseFileReader( + spark = sqlContext.sparkSession, + partitionSchema = partitionSchema, + tableSchema = tableSchema, + requiredSchema = requiredSchema, filters = filters, options = optParams, - hadoopConf = sqlContext.sparkSession.sessionState.newHadoopConf() + // NOTE: We have to fork the Hadoop Config here as Spark will be modifying it + // to configure Parquet reader appropriately + hadoopConf = new Configuration(conf) ) - val rdd = new HoodieMergeOnReadRDD( - sqlContext.sparkContext, - jobConf, - fullSchemaParquetReader, - requiredSchemaParquetReader, - hoodieTableState - ) + val tableState = HoodieMergeOnReadTableState(fileIndex, recordKeyField, preCombineFieldOpt) + + val rdd = new HoodieMergeOnReadRDD(sqlContext.sparkContext, jobConf, fullSchemaParquetReader, + requiredSchemaParquetReader, tableState, tableSchema, requiredSchema) + rdd.asInstanceOf[RDD[Row]] } def buildFileIndex(filters: Array[Filter]): List[HoodieMergeOnReadFileSplit] = { - if (globPaths.isDefined) { + if (globPaths.nonEmpty) { // Load files from the global paths if it has defined to be compatible with the original mode - val inMemoryFileIndex = HoodieSparkUtils.createInMemoryFileIndex(sqlContext.sparkSession, globPaths.get) + val inMemoryFileIndex = HoodieSparkUtils.createInMemoryFileIndex(sqlContext.sparkSession, globPaths) val fsView = new HoodieTableFileSystemView(metaClient, // file-slice after pending compaction-requested instant-time is also considered valid metaClient.getCommitsAndCompactionTimeline.filterCompletedAndCompactionInstants, @@ -196,10 +194,12 @@ class 
MergeOnReadSnapshotRelation(val sqlContext: SQLContext, val partitionFilters = filters.filter(f => f.references.forall(p => partitionColumns.contains(p))) val partitionFilterExpression = HoodieSparkUtils.convertToCatalystExpressions(partitionFilters, tableStructSchema) + val convertedPartitionFilterExpression = + HoodieFileIndex.convertFilterForTimestampKeyGenerator(metaClient, partitionFilterExpression.toSeq) // If convert success to catalyst expression, use the partition prune - val fileSlices = if (partitionFilterExpression.isDefined) { - hoodieFileIndex.listFileSlices(Seq(partitionFilterExpression.get)) + val fileSlices = if (convertedPartitionFilterExpression.nonEmpty) { + hoodieFileIndex.listFileSlices(convertedPartitionFilterExpression) } else { hoodieFileIndex.listFileSlices(Seq.empty[Expression]) } @@ -221,8 +221,7 @@ class MergeOnReadSnapshotRelation(val sqlContext: SQLContext, Option.empty } - val logPaths = fileSlice.getLogFiles.sorted(HoodieLogFile.getLogFileComparator).iterator().asScala - .map(logFile => MergeOnReadSnapshotRelation.getFilePath(logFile.getPath)).toList + val logPaths = fileSlice.getLogFiles.sorted(HoodieLogFile.getLogFileComparator).iterator().asScala.toList val logPathsOptional = if (logPaths.isEmpty) Option.empty else Option(logPaths) HoodieMergeOnReadFileSplit(partitionedFile, logPathsOptional, queryInstant, metaClient.getBasePath, @@ -232,6 +231,11 @@ class MergeOnReadSnapshotRelation(val sqlContext: SQLContext, } } } + + private def appendMandatoryColumns(requestedColumns: Array[String]): Array[String] = { + val missing = mandatoryColumns.filter(col => !requestedColumns.contains(col)) + requestedColumns ++ missing + } } object MergeOnReadSnapshotRelation { @@ -252,14 +256,4 @@ object MergeOnReadSnapshotRelation { path.toUri.toString } - def getRequiredSchema(tableAvroSchema: Schema, requiredColumns: Array[String]): (Schema, StructType) = { - // First get the required avro-schema, then convert the avro-schema to spark schema. - val name2Fields = tableAvroSchema.getFields.asScala.map(f => f.name() -> f).toMap - val requiredFields = requiredColumns.map(c => name2Fields(c)) - .map(f => new Schema.Field(f.name(), f.schema(), f.doc(), f.defaultVal(), f.order())).toList - val requiredAvroSchema = Schema.createRecord(tableAvroSchema.getName, tableAvroSchema.getDoc, - tableAvroSchema.getNamespace, tableAvroSchema.isError, requiredFields.asJava) - val requiredStructSchema = AvroConversionUtils.convertAvroSchemaToStructType(requiredAvroSchema) - (requiredAvroSchema, requiredStructSchema) - } } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkConfigs.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkConfigs.scala new file mode 100644 index 0000000000000..75dee2108914f --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkConfigs.scala @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi + +object SparkConfigs { + + // Spark data source write pool name. In case of a streaming sink, users might be interested in setting custom scheduling configs + // for regular writes and async compaction. In such cases, this pool name will be used for spark datasource writes. + val SPARK_DATASOURCE_WRITER_POOL_NAME = "sparkdatasourcewrite" + + /* + When async compaction is enabled (deltastreamer or streaming sink), users might be interested in setting custom + scheduling configs for regular writes and async compaction. This is the property used to set a custom scheduler config + file with Spark. In Deltastreamer, the file is generated within Hudi and set if necessary, whereas in the case of a streaming + sink, users have to set this property when they invoke the Spark shell. + Sample format of the file contents: + + <?xml version="1.0"?> + <allocations> + <pool name="sparkdatasourcewrite"> + <schedulingMode>FAIR</schedulingMode> + <weight>4</weight> + <minShare>2</minShare> + </pool> + <pool name="hoodiecompact"> + <schedulingMode>FAIR</schedulingMode> + <weight>3</weight> + <minShare>1</minShare> + </pool> + </allocations> + */ + val SPARK_SCHEDULER_ALLOCATION_FILE_KEY = "spark.scheduler.allocation.file" + +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkHoodieTableFileIndex.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkHoodieTableFileIndex.scala index a79ac6f1db73b..46201c4132078 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkHoodieTableFileIndex.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkHoodieTableFileIndex.scala @@ -18,8 +18,9 @@ package org.apache.hudi import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.hudi.BaseHoodieTableFileIndex.PartitionPath import org.apache.hudi.DataSourceReadOptions.{QUERY_TYPE, QUERY_TYPE_INCREMENTAL_OPT_VAL, QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, QUERY_TYPE_SNAPSHOT_OPT_VAL} -import org.apache.hudi.SparkHoodieTableFileIndex.{deduceQueryType, generateFieldMap} +import org.apache.hudi.SparkHoodieTableFileIndex.{deduceQueryType, generateFieldMap, toJavaOption} import org.apache.hudi.client.common.HoodieSparkEngineContext import org.apache.hudi.common.config.TypedProperties import org.apache.hudi.common.model.{FileSlice, HoodieTableQueryType} @@ -36,10 +37,11 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{StringType, StructField, StructType} import org.apache.spark.unsafe.types.UTF8String -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ +import scala.language.implicitConversions /** - * Implementation of the [[HoodieTableFileIndexBase]] for Spark + * Implementation of the [[BaseHoodieTableFileIndex]] for Spark * * @param spark spark session * @param metaClient Hudi table's meta-client @@ -55,14 +57,16 @@ class SparkHoodieTableFileIndex(spark: SparkSession, queryPaths: Seq[Path], specifiedQueryInstant: Option[String] = None, @transient fileStatusCache: FileStatusCache = NoopCache) - extends HoodieTableFileIndexBase( - engineContext = new HoodieSparkEngineContext(new JavaSparkContext(spark.sparkContext)), + extends BaseHoodieTableFileIndex( + new HoodieSparkEngineContext(new JavaSparkContext(spark.sparkContext)), metaClient, configProperties, - queryType = 
deduceQueryType(configProperties), - queryPaths, - specifiedQueryInstant, - fileStatusCache = SparkHoodieTableFileIndex.adapt(fileStatusCache) + deduceQueryType(configProperties), + queryPaths.asJava, + toJavaOption(specifiedQueryInstant), + false, + false, + SparkHoodieTableFileIndex.adapt(fileStatusCache) ) with SparkAdapterSupport with Logging { @@ -136,9 +140,9 @@ class SparkHoodieTableFileIndex(spark: SparkSession, */ def listFileSlices(partitionFilters: Seq[Expression]): Map[String, Seq[FileSlice]] = { // Prune the partition path by the partition filters - val prunedPartitions = prunePartition(cachedAllInputFileSlices.keys.toSeq, partitionFilters) + val prunedPartitions = prunePartition(cachedAllInputFileSlices.asScala.keys.toSeq, partitionFilters) prunedPartitions.map(partition => { - (partition.path, cachedAllInputFileSlices(partition)) + (partition.path, cachedAllInputFileSlices.get(partition).asScala) }).toMap } @@ -150,9 +154,7 @@ class SparkHoodieTableFileIndex(spark: SparkSession, * @param predicates The filter condition. * @return The Pruned partition paths. */ - def prunePartition(partitionPaths: Seq[PartitionPath], - predicates: Seq[Expression]): Seq[PartitionPath] = { - + def prunePartition(partitionPaths: Seq[PartitionPath], predicates: Seq[Expression]): Seq[PartitionPath] = { val partitionColumnNames = partitionSchema.fields.map(_.name).toSet val partitionPruningPredicates = predicates.filter { _.references.map(_.name).toSet.subsetOf(partitionColumnNames) @@ -167,8 +169,9 @@ class SparkHoodieTableFileIndex(spark: SparkSession, }) val prunedPartitionPaths = partitionPaths.filter { - case PartitionPath(_, values) => boundPredicate.eval(InternalRow.fromSeq(values)) + partitionPath => boundPredicate.eval(InternalRow.fromSeq(partitionPath.values)) } + logInfo(s"Total partition size is: ${partitionPaths.size}," + s" after partition prune size is: ${prunedPartitionPaths.size}") prunedPartitionPaths @@ -177,7 +180,7 @@ class SparkHoodieTableFileIndex(spark: SparkSession, } } - protected def parsePartitionColumnValues(partitionColumns: Array[String], partitionPath: String): Array[Any] = { + protected def parsePartitionColumnValues(partitionColumns: Array[String], partitionPath: String): Array[Object] = { if (partitionColumns.length == 0) { // This is a non-partitioned table Array.empty @@ -225,7 +228,7 @@ class SparkHoodieTableFileIndex(spark: SparkSession, val pathWithPartitionName = new Path(basePath, partitionWithName) val partitionValues = parsePartitionPath(pathWithPartitionName, partitionSchema) - partitionValues.toArray + partitionValues.map(_.asInstanceOf[Object]).toArray } } } @@ -247,6 +250,13 @@ class SparkHoodieTableFileIndex(spark: SparkSession, object SparkHoodieTableFileIndex { + implicit def toJavaOption[T](opt: Option[T]): org.apache.hudi.common.util.Option[T] = + if (opt.isDefined) { + org.apache.hudi.common.util.Option.of(opt.get) + } else { + org.apache.hudi.common.util.Option.empty() + } + /** * This method unravels [[StructType]] into a [[Map]] of pairs of dot-path notation with corresponding * [[StructField]] object for every field of the provided [[StructType]], recursively. 
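As a rough sketch of the dot-path unraveling described in the scaladoc above (not the actual generateFieldMap body; the names here are illustrative):

import org.apache.spark.sql.types.{StructField, StructType}

object FieldPathSketch {
  // Recursively flattens a StructType into dot-path -> StructField pairs,
  // e.g. a field "b" nested under struct field "a" yields the keys "a" and "a.b"
  def fieldPaths(schema: StructType, prefix: String = ""): Map[String, StructField] =
    schema.fields.flatMap { field =>
      val path = if (prefix.isEmpty) field.name else s"$prefix.${field.name}"
      field.dataType match {
        case nested: StructType => Map(path -> field) ++ fieldPaths(nested, path)
        case _ => Map(path -> field)
      }
    }.toMap
}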
@@ -287,17 +297,17 @@ object SparkHoodieTableFileIndex { } private def deduceQueryType(configProperties: TypedProperties): HoodieTableQueryType = { - configProperties(QUERY_TYPE.key()) match { - case QUERY_TYPE_SNAPSHOT_OPT_VAL => HoodieTableQueryType.QUERY_TYPE_SNAPSHOT - case QUERY_TYPE_INCREMENTAL_OPT_VAL => HoodieTableQueryType.QUERY_TYPE_INCREMENTAL - case QUERY_TYPE_READ_OPTIMIZED_OPT_VAL => HoodieTableQueryType.QUERY_TYPE_READ_OPTIMIZED + configProperties.asScala(QUERY_TYPE.key()) match { + case QUERY_TYPE_SNAPSHOT_OPT_VAL => HoodieTableQueryType.SNAPSHOT + case QUERY_TYPE_INCREMENTAL_OPT_VAL => HoodieTableQueryType.INCREMENTAL + case QUERY_TYPE_READ_OPTIMIZED_OPT_VAL => HoodieTableQueryType.READ_OPTIMIZED case _ @ qt => throw new IllegalArgumentException(s"query-type ($qt) not supported") } } - private def adapt(cache: FileStatusCache): FileStatusCacheTrait = { - new FileStatusCacheTrait { - override def get(path: Path): Option[Array[FileStatus]] = cache.getLeafFiles(path) + private def adapt(cache: FileStatusCache): BaseHoodieTableFileIndex.FileStatusCache = { + new BaseHoodieTableFileIndex.FileStatusCache { + override def get(path: Path): org.apache.hudi.common.util.Option[Array[FileStatus]] = toJavaOption(cache.getLeafFiles(path)) override def put(path: Path, leafFiles: Array[FileStatus]): Unit = cache.putLeafFiles(path, leafFiles) override def invalidate(): Unit = cache.invalidateAll() } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/avro/HoodieAvroSerializer.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/avro/HoodieAvroSerializer.scala index b464c2dc5d611..050efbd3d22c2 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/avro/HoodieAvroSerializer.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/avro/HoodieAvroSerializer.scala @@ -20,9 +20,10 @@ package org.apache.spark.sql.avro import org.apache.avro.Schema import org.apache.spark.sql.types.DataType -/** - * As AvroSerializer cannot be access out of the spark.sql.avro package since spark 3.1, we define - * this class to be accessed by other class. 
- */ -case class HoodieAvroSerializer(rootCatalystType: DataType, rootAvroType: Schema, nullable: Boolean) - extends AvroSerializer(rootCatalystType, rootAvroType, nullable) +class HoodieAvroSerializer(rootCatalystType: DataType, rootAvroType: Schema, nullable: Boolean) + extends HoodieAvroSerializerTrait { + + val avroSerializer = new AvroSerializer(rootCatalystType, rootAvroType, nullable) + + override def serialize(catalystData: Any): Any = avroSerializer.serialize(catalystData) +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala index f14ccbe6066c0..98823d14222d9 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala @@ -25,17 +25,15 @@ import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient} import org.apache.hudi.common.util.ValidationUtils import org.apache.hudi.keygen.ComplexKeyGenerator import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory - import org.apache.spark.internal.Logging -import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.avro.SchemaConverters import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.hudi.HoodieOptionConfig import org.apache.spark.sql.hudi.HoodieSqlCommonUtils._ import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.sql.{AnalysisException, SparkSession} import java.util.{Locale, Properties} - import scala.collection.JavaConverters._ import scala.collection.mutable diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlCommonUtils.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlCommonUtils.scala index 4901c0d39117d..1e1e9c663e54f 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlCommonUtils.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlCommonUtils.scala @@ -32,11 +32,11 @@ import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.{Resolver, UnresolvedRelation} import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType} -import org.apache.spark.sql.catalyst.expressions.{And, Attribute, Expression} +import org.apache.spark.sql.catalyst.expressions.{And, Attribute, Cast, Expression, Literal} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias} import org.apache.spark.sql.execution.datasources.LogicalRelation -import org.apache.spark.sql.internal.StaticSQLConf -import org.apache.spark.sql.types.{StringType, StructField, StructType} +import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} +import org.apache.spark.sql.types.{DataType, NullType, StringType, StructField, StructType} import org.apache.spark.sql.{Column, DataFrame, SparkSession} import java.net.URI @@ -54,24 +54,6 @@ object HoodieSqlCommonUtils extends SparkAdapterSupport { override def get() = new SimpleDateFormat("yyyy-MM-dd") }) - def isHoodieTable(table: CatalogTable): Boolean = { - 
table.provider.map(_.toLowerCase(Locale.ROOT)).orNull == "hudi" - } - - def isHoodieTable(tableId: TableIdentifier, spark: SparkSession): Boolean = { - val table = spark.sessionState.catalog.getTableMetadata(tableId) - isHoodieTable(table) - } - - def isHoodieTable(table: LogicalPlan, spark: SparkSession): Boolean = { - tripAlias(table) match { - case LogicalRelation(_, _, Some(tbl), _) => isHoodieTable(tbl) - case relation: UnresolvedRelation => - isHoodieTable(sparkAdapter.toTableIdentifier(relation), spark) - case _=> false - } - } - def getTableIdentifier(table: LogicalPlan): TableIdentifier = { table match { case SubqueryAlias(name, _) => sparkAdapter.toTableIdentifier(name) @@ -200,16 +182,29 @@ object HoodieSqlCommonUtils extends SparkAdapterSupport { getTableLocation(table, spark) } + def getTableLocation(properties: Map[String, String], identifier: TableIdentifier, sparkSession: SparkSession): String = { + val location: Option[String] = Some(properties.getOrElse("location", "")) + val isManaged = location.isEmpty || location.get.isEmpty + val uri = if (isManaged) { + Some(sparkSession.sessionState.catalog.defaultTablePath(identifier)) + } else { + Some(new Path(location.get).toUri) + } + getTableLocation(uri, identifier, sparkSession) + } + def getTableLocation(table: CatalogTable, sparkSession: SparkSession): String = { - val uri = if (table.tableType == CatalogTableType.MANAGED && isHoodieTable(table)) { + val uri = table.storage.locationUri.orElse { Some(sparkSession.sessionState.catalog.defaultTablePath(table.identifier)) - } else { - table.storage.locationUri } + getTableLocation(uri, table.identifier, sparkSession) + } + + def getTableLocation(uri: Option[URI], identifier: TableIdentifier, sparkSession: SparkSession): String = { val conf = sparkSession.sessionState.newHadoopConf() uri.map(makePathQualified(_, conf)) .map(removePlaceHolder) - .getOrElse(throw new IllegalArgumentException(s"Missing location for ${table.identifier}")) + .getOrElse(throw new IllegalArgumentException(s"Missing location for ${identifier}")) } private def removePlaceHolder(path: String): String = { @@ -312,4 +307,18 @@ object HoodieSqlCommonUtils extends SparkAdapterSupport { case field if resolver(field.name, name) => field } } + + // Compare a [[StructField]] to another, return true if they have the same column + // name(by resolver) and dataType. + def columnEqual(field: StructField, other: StructField, resolver: Resolver): Boolean = { + resolver(field.name, other.name) && field.dataType == other.dataType + } + + def castIfNeeded(child: Expression, dataType: DataType, conf: SQLConf): Expression = { + child match { + case Literal(nul, NullType) => Literal(nul, dataType) + case _ => if (child.dataType != dataType) + Cast(child, dataType, Option(conf.sessionLocalTimeZone)) else child + } + } } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala new file mode 100644 index 0000000000000..d6745b6795032 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/ProvidesHoodieConfig.scala @@ -0,0 +1,196 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi + +import org.apache.hudi.DataSourceWriteOptions +import org.apache.hudi.DataSourceWriteOptions._ +import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload +import org.apache.hudi.config.HoodieWriteConfig +import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME +import org.apache.hudi.hive.MultiPartKeysValueExtractor +import org.apache.hudi.hive.ddl.HiveSyncMode +import org.apache.hudi.keygen.ComplexKeyGenerator +import org.apache.hudi.sql.InsertMode +import org.apache.spark.internal.Logging +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.{isEnableHive, withSparkConf} +import org.apache.spark.sql.hudi.command.{SqlKeyGenerator, ValidateDuplicateKeyPayload} + +import scala.collection.JavaConverters.propertiesAsScalaMapConverter + +trait ProvidesHoodieConfig extends Logging { + + def buildHoodieConfig(hoodieCatalogTable: HoodieCatalogTable): Map[String, String] = { + val sparkSession: SparkSession = hoodieCatalogTable.spark + val catalogProperties = hoodieCatalogTable.catalogProperties + val tableConfig = hoodieCatalogTable.tableConfig + val tableId = hoodieCatalogTable.table.identifier + + // NOTE: Here we fallback to "" to make sure that null value is not overridden with + // default value ("ts") + // TODO(HUDI-3456) clean up + val preCombineField = Option(tableConfig.getPreCombineField).getOrElse("") + + require(hoodieCatalogTable.primaryKeys.nonEmpty, + s"There are no primary key in table ${hoodieCatalogTable.table.identifier}, cannot execute update operator") + val enableHive = isEnableHive(sparkSession) + + withSparkConf(sparkSession, catalogProperties) { + Map.apply( + "path" -> hoodieCatalogTable.tableLocation, + RECORDKEY_FIELD.key -> hoodieCatalogTable.primaryKeys.mkString(","), + TBL_NAME.key -> hoodieCatalogTable.tableName, + PRECOMBINE_FIELD.key -> preCombineField, + HIVE_STYLE_PARTITIONING.key -> tableConfig.getHiveStylePartitioningEnable, + URL_ENCODE_PARTITIONING.key -> tableConfig.getUrlEncodePartitioning, + KEYGENERATOR_CLASS_NAME.key -> classOf[SqlKeyGenerator].getCanonicalName, + SqlKeyGenerator.ORIGIN_KEYGEN_CLASS_NAME -> tableConfig.getKeyGeneratorClassName, + OPERATION.key -> UPSERT_OPERATION_OPT_VAL, + PARTITIONPATH_FIELD.key -> tableConfig.getPartitionFieldProp, + META_SYNC_ENABLED.key -> enableHive.toString, + HIVE_SYNC_MODE.key -> HiveSyncMode.HMS.name(), + HIVE_USE_JDBC.key -> "false", + HIVE_DATABASE.key -> tableId.database.getOrElse("default"), + HIVE_TABLE.key -> tableId.table, + HIVE_PARTITION_FIELDS.key -> tableConfig.getPartitionFieldProp, + HIVE_PARTITION_EXTRACTOR_CLASS.key -> classOf[MultiPartKeysValueExtractor].getCanonicalName, + HIVE_SUPPORT_TIMESTAMP_TYPE.key -> "true", + HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key -> "200", + SqlKeyGenerator.PARTITION_SCHEMA -> hoodieCatalogTable.partitionSchema.toDDL + ) + 
.filter { case(_, v) => v != null } + } + } + + /** + * Build the default config for insert. + * + * @return + */ + def buildHoodieInsertConfig(hoodieCatalogTable: HoodieCatalogTable, + sparkSession: SparkSession, + isOverwrite: Boolean, + insertPartitions: Map[String, Option[String]] = Map.empty, + extraOptions: Map[String, String]): Map[String, String] = { + + if (insertPartitions.nonEmpty && + (insertPartitions.keys.toSet != hoodieCatalogTable.partitionFields.toSet)) { + throw new IllegalArgumentException(s"Insert partition fields" + + s"[${insertPartitions.keys.mkString(" ")}]" + + s" not equal to the defined partition in table[${hoodieCatalogTable.partitionFields.mkString(",")}]") + } + val path = hoodieCatalogTable.tableLocation + val tableType = hoodieCatalogTable.tableTypeName + val tableConfig = hoodieCatalogTable.tableConfig + val tableSchema = hoodieCatalogTable.tableSchema + + val options = hoodieCatalogTable.catalogProperties ++ tableConfig.getProps.asScala.toMap ++ extraOptions + val parameters = withSparkConf(sparkSession, options)() + + val partitionFieldsStr = hoodieCatalogTable.partitionFields.mkString(",") + + // NOTE: Here we fallback to "" to make sure that null value is not overridden with + // default value ("ts") + // TODO(HUDI-3456) clean up + val preCombineField = hoodieCatalogTable.preCombineKey.getOrElse("") + + val hiveStylePartitioningEnable = Option(tableConfig.getHiveStylePartitioningEnable).getOrElse("true") + val urlEncodePartitioning = Option(tableConfig.getUrlEncodePartitioning).getOrElse("false") + val keyGeneratorClassName = Option(tableConfig.getKeyGeneratorClassName) + .getOrElse(classOf[ComplexKeyGenerator].getCanonicalName) + + val enableBulkInsert = parameters.getOrElse(DataSourceWriteOptions.SQL_ENABLE_BULK_INSERT.key, + DataSourceWriteOptions.SQL_ENABLE_BULK_INSERT.defaultValue()).toBoolean + val dropDuplicate = sparkSession.conf + .getOption(INSERT_DROP_DUPS.key).getOrElse(INSERT_DROP_DUPS.defaultValue).toBoolean + + val insertMode = InsertMode.of(parameters.getOrElse(DataSourceWriteOptions.SQL_INSERT_MODE.key, + DataSourceWriteOptions.SQL_INSERT_MODE.defaultValue())) + val isNonStrictMode = insertMode == InsertMode.NON_STRICT + val isPartitionedTable = hoodieCatalogTable.partitionFields.nonEmpty + val hasPrecombineColumn = hoodieCatalogTable.preCombineKey.nonEmpty + val operation = + (enableBulkInsert, isOverwrite, dropDuplicate, isNonStrictMode, isPartitionedTable) match { + case (true, _, _, false, _) => + throw new IllegalArgumentException(s"Table with primaryKey can not use bulk insert in ${insertMode.value()} mode.") + case (true, true, _, _, true) => + throw new IllegalArgumentException(s"Insert Overwrite Partition can not use bulk insert.") + case (true, _, true, _, _) => + throw new IllegalArgumentException(s"Bulk insert cannot support drop duplication." + + s" Please disable $INSERT_DROP_DUPS and try again.") + // if enableBulkInsert is true, use bulk insert for the insert overwrite non-partitioned table. + case (true, true, _, _, false) => BULK_INSERT_OPERATION_OPT_VAL + // insert overwrite table + case (false, true, _, _, false) => INSERT_OVERWRITE_TABLE_OPERATION_OPT_VAL + // insert overwrite partition + case (_, true, _, _, true) => INSERT_OVERWRITE_OPERATION_OPT_VAL + // disable dropDuplicate, and provide preCombineKey, use the upsert operation for strict and upsert mode. 
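+ // (Illustrative walk-through of this match, with hypothetical flag values:
+ //  enableBulkInsert=false, isOverwrite=false, dropDuplicate=false, isNonStrictMode=false
+ //  with a precombine column selects UPSERT below, while enableBulkInsert=true together with
+ //  isNonStrictMode=true, no overwrite and no drop-duplicates selects BULK_INSERT further down.)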
+ case (false, false, false, false, _) if hasPrecombineColumn => UPSERT_OPERATION_OPT_VAL + // if table is pk table and has enableBulkInsert use bulk insert for non-strict mode. + case (true, _, _, true, _) => BULK_INSERT_OPERATION_OPT_VAL + // for the rest case, use the insert operation + case _ => INSERT_OPERATION_OPT_VAL + } + + val payloadClassName = if (operation == UPSERT_OPERATION_OPT_VAL && + tableType == COW_TABLE_TYPE_OPT_VAL && insertMode == InsertMode.STRICT) { + // Only validate duplicate key for COW, for MOR it will do the merge with the DefaultHoodieRecordPayload + // on reading. + classOf[ValidateDuplicateKeyPayload].getCanonicalName + } else { + classOf[OverwriteWithLatestAvroPayload].getCanonicalName + } + + logInfo(s"Insert statement use write operation type: $operation, payloadClass: $payloadClassName") + + val enableHive = isEnableHive(sparkSession) + + withSparkConf(sparkSession, options) { + Map( + "path" -> path, + TABLE_TYPE.key -> tableType, + TBL_NAME.key -> hoodieCatalogTable.tableName, + OPERATION.key -> operation, + HIVE_STYLE_PARTITIONING.key -> hiveStylePartitioningEnable, + URL_ENCODE_PARTITIONING.key -> urlEncodePartitioning, + KEYGENERATOR_CLASS_NAME.key -> classOf[SqlKeyGenerator].getCanonicalName, + SqlKeyGenerator.ORIGIN_KEYGEN_CLASS_NAME -> keyGeneratorClassName, + RECORDKEY_FIELD.key -> hoodieCatalogTable.primaryKeys.mkString(","), + PRECOMBINE_FIELD.key -> preCombineField, + PARTITIONPATH_FIELD.key -> partitionFieldsStr, + PAYLOAD_CLASS_NAME.key -> payloadClassName, + ENABLE_ROW_WRITER.key -> enableBulkInsert.toString, + HoodieWriteConfig.COMBINE_BEFORE_INSERT.key -> String.valueOf(hasPrecombineColumn), + HIVE_PARTITION_FIELDS.key -> partitionFieldsStr, + META_SYNC_ENABLED.key -> enableHive.toString, + HIVE_SYNC_MODE.key -> HiveSyncMode.HMS.name(), + HIVE_USE_JDBC.key -> "false", + HIVE_DATABASE.key -> hoodieCatalogTable.table.identifier.database.getOrElse("default"), + HIVE_TABLE.key -> hoodieCatalogTable.table.identifier.table, + HIVE_SUPPORT_TIMESTAMP_TYPE.key -> "true", + HIVE_PARTITION_EXTRACTOR_CLASS.key -> classOf[MultiPartKeysValueExtractor].getCanonicalName, + HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key -> "200", + HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key -> "200", + SqlKeyGenerator.PARTITION_SCHEMA -> hoodieCatalogTable.partitionSchema.toDDL + ) + .filter { case (_, v) => v != null } + } + } + +} diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableAddColumnsCommand.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableAddColumnsCommand.scala index c6c79f431337e..c4f5cd39f6073 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableAddColumnsCommand.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableAddColumnsCommand.scala @@ -57,7 +57,8 @@ case class AlterHoodieTableAddColumnsCommand( s" table columns is: [${hoodieCatalogTable.tableSchemaWithoutMetaFields.fieldNames.mkString(",")}]") } // Get the new schema - val newSqlSchema = StructType(tableSchema.fields ++ colsToAdd) + val rearrangedSchema = hoodieCatalogTable.dataSchema ++ colsToAdd ++ hoodieCatalogTable.partitionSchema + val newSqlSchema = StructType(rearrangedSchema) val (structName, nameSpace) = AvroConversionUtils.getAvroRecordNameAndNamespace(tableId.table) val newSchema = 
AvroConversionUtils.convertStructTypeToAvroSchema(newSqlSchema, structName, nameSpace) diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableChangeColumnCommand.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableChangeColumnCommand.scala index befda70680f85..3aa5ca945486e 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableChangeColumnCommand.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/AlterHoodieTableChangeColumnCommand.scala @@ -49,6 +49,13 @@ case class AlterHoodieTableChangeColumnCommand( throw new AnalysisException(s"Can't find column `$columnName` given table data columns " + s"${hoodieCatalogTable.dataSchema.fieldNames.mkString("[`", "`, `", "`]")}") ) + // Throw an AnalysisException if the column name/dataType is changed. + if (!columnEqual(originColumn, newColumn, resolver)) { + throw new AnalysisException( + "ALTER TABLE CHANGE COLUMN is not supported for changing column " + + s"'${originColumn.name}' with type '${originColumn.dataType}' to " + + s"'${newColumn.name}' with type '${newColumn.dataType}'") + } // Get the new schema val newTableSchema = StructType( diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/SqlKeyGenerator.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/SqlKeyGenerator.scala index e4392380465ee..9d139389fd235 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/SqlKeyGenerator.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/SqlKeyGenerator.scala @@ -47,7 +47,7 @@ class SqlKeyGenerator(props: TypedProperties) extends ComplexKeyGenerator(props) // The origin key generator class for this table. 
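  // NOTE: SQL writes store the user's key generator under ORIGIN_KEYGEN_CLASS_NAME, and the
  // property may arrive as an empty string rather than null; the nonEmpty guard below (and in
  // getRealKeyGenClassName) skips instantiation in that case, so the ComplexKeyGenerator
  // default applies instead.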
private lazy val originKeyGen = { val beforeKeyGenClassName = props.getString(SqlKeyGenerator.ORIGIN_KEYGEN_CLASS_NAME, null) - if (beforeKeyGenClassName != null) { + if (beforeKeyGenClassName != null && beforeKeyGenClassName.nonEmpty) { val keyGenProps = new TypedProperties() keyGenProps.putAll(props) keyGenProps.remove(SqlKeyGenerator.ORIGIN_KEYGEN_CLASS_NAME) @@ -132,7 +132,7 @@ object SqlKeyGenerator { def getRealKeyGenClassName(props: TypedProperties): String = { val beforeKeyGenClassName = props.getString(SqlKeyGenerator.ORIGIN_KEYGEN_CLASS_NAME, null) - if (beforeKeyGenClassName != null) { + if (beforeKeyGenClassName != null && beforeKeyGenClassName.nonEmpty) { HoodieSparkKeyGeneratorFactory.convertToSparkKeyGenerator(beforeKeyGenClassName) } else { classOf[ComplexKeyGenerator].getCanonicalName diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionCodeGen.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionCodeGen.scala index 509746bae160f..947291d10373b 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionCodeGen.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionCodeGen.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.hudi.command.payload -import java.util.UUID import org.apache.avro.generic.{GenericRecord, IndexedRecord} import org.apache.hudi.sql.IExpressionEvaluator import org.apache.spark.executor.InputMetrics @@ -37,6 +36,8 @@ import org.apache.spark.{TaskContext, TaskKilledException} import org.codehaus.commons.compiler.CompileException import org.codehaus.janino.{ClassBodyEvaluator, InternalCompilerException} +import java.util.UUID + /** * Do CodeGen for expression based on IndexedRecord. 
* The mainly difference with the spark's CodeGen for expression is that diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionPayload.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionPayload.scala index 0800d1712d978..e59a609321549 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionPayload.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/payload/ExpressionPayload.scala @@ -17,14 +17,9 @@ package org.apache.spark.sql.hudi.command.payload -import java.util.{Base64, Properties} -import java.util.concurrent.Callable - import com.google.common.cache.CacheBuilder - import org.apache.avro.Schema import org.apache.avro.generic.{GenericData, GenericRecord, IndexedRecord} - import org.apache.hudi.AvroConversionUtils import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.avro.HoodieAvroUtils @@ -34,13 +29,14 @@ import org.apache.hudi.common.util.{ValidationUtils, Option => HOption} import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.io.HoodieWriteHandle import org.apache.hudi.sql.IExpressionEvaluator - -import org.apache.spark.sql.avro.{AvroSerializer, HoodieAvroSerializer, SchemaConverters} +import org.apache.spark.sql.avro.{AvroSerializer, SchemaConverters} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.hudi.SerDeUtils import org.apache.spark.sql.hudi.command.payload.ExpressionPayload.getEvaluator import org.apache.spark.sql.types.{StructField, StructType} +import java.util.concurrent.Callable +import java.util.{Base64, Properties} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer @@ -315,7 +311,7 @@ object ExpressionPayload { val conditionEvaluator = ExpressionCodeGen.doCodeGen(Seq(condition), conditionSerializer) val assignSqlType = AvroConversionUtils.convertAvroSchemaToStructType(writeSchema) - val assignSerializer = new HoodieAvroSerializer(assignSqlType, writeSchema, false) + val assignSerializer = new AvroSerializer(assignSqlType, writeSchema, false) val assignmentEvaluator = ExpressionCodeGen.doCodeGen(assignments, assignSerializer) conditionEvaluator -> assignmentEvaluator } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/payload/SqlTypedRecord.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/payload/SqlTypedRecord.scala index 749761443547a..29025877b48c9 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/payload/SqlTypedRecord.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/command/payload/SqlTypedRecord.scala @@ -17,22 +17,19 @@ package org.apache.spark.sql.hudi.command.payload -import org.apache.avro.generic.IndexedRecord import org.apache.avro.Schema - -import org.apache.hudi.AvroConversionUtils - -import org.apache.spark.sql.avro.HoodieAvroDeserializer +import org.apache.avro.generic.IndexedRecord +import org.apache.hudi.{AvroConversionUtils, SparkAdapterSupport} import org.apache.spark.sql.catalyst.InternalRow /** * A sql typed record which will convert the avro field to sql typed value. 
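 * A rough usage sketch (the wrapped record and its field layout are hypothetical):
 * {{{
 *   val typed = new SqlTypedRecord(avroRecord)
 *   val firstField = typed.get(0) // returned as a Spark SQL internal value, e.g. UTF8String
 * }}}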
*/ -class SqlTypedRecord(val record: IndexedRecord) extends IndexedRecord { +class SqlTypedRecord(val record: IndexedRecord) extends IndexedRecord with SparkAdapterSupport { private lazy val sqlType = AvroConversionUtils.convertAvroSchemaToStructType(getSchema) - private lazy val avroDeserializer = HoodieAvroDeserializer(record.getSchema, sqlType) - private lazy val sqlRow = avroDeserializer.deserializeData(record).asInstanceOf[InternalRow] + private lazy val avroDeserializer = sparkAdapter.createAvroDeserializer(record.getSchema, sqlType) + private lazy val sqlRow = avroDeserializer.deserialize(record).get.asInstanceOf[InternalRow] override def put(i: Int, v: Any): Unit = { record.put(i, v) diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/streaming/HoodieStreamSource.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/streaming/HoodieStreamSource.scala index ffe9b64984027..4e46233c3596e 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/streaming/HoodieStreamSource.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/streaming/HoodieStreamSource.scala @@ -161,12 +161,12 @@ class HoodieStreamSource( val rdd = tableType match { case HoodieTableType.COPY_ON_WRITE => val serDe = sparkAdapter.createSparkRowSerDe(RowEncoder(schema)) - new IncrementalRelation(sqlContext, incParams, schema, metaClient) + new IncrementalRelation(sqlContext, incParams, Some(schema), metaClient) .buildScan() .map(serDe.serializeRow) case HoodieTableType.MERGE_ON_READ => val requiredColumns = schema.fields.map(_.name) - new MergeOnReadIncrementalRelation(sqlContext, incParams, schema, metaClient) + new MergeOnReadIncrementalRelation(sqlContext, incParams, Some(schema), metaClient) .buildScan(requiredColumns, Array.empty[Filter]) .asInstanceOf[RDD[InternalRow]] case _ => throw new IllegalArgumentException(s"UnSupport tableType: $tableType") diff --git a/hudi-spark-datasource/hudi-spark/src/main/antlr4/org/apache/hudi/spark/sql/parser/HoodieSqlCommon.g4 b/hudi-spark-datasource/hudi-spark/src/main/antlr4/org/apache/hudi/spark/sql/parser/HoodieSqlCommon.g4 index 74f83438f659c..0cde14a4e4a0e 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/antlr4/org/apache/hudi/spark/sql/parser/HoodieSqlCommon.g4 +++ b/hudi-spark-datasource/hudi-spark/src/main/antlr4/org/apache/hudi/spark/sql/parser/HoodieSqlCommon.g4 @@ -14,59 +14,197 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -grammar HoodieSqlCommon; + + grammar HoodieSqlCommon; + + @lexer::members { + /** + * Verify whether current token is a valid decimal token (which contains dot). + * Returns true if the character that follows the token is not a digit or letter or underscore. + * + * For example: + * For char stream "2.3", "2." is not a valid decimal token, because it is followed by digit '3'. + * For char stream "2.3_", "2.3" is not a valid decimal token, because it is followed by '_'. + * For char stream "2.3W", "2.3" is not a valid decimal token, because it is followed by 'W'. + * For char stream "12.0D 34.E2+0.12 " 12.0D is a valid decimal token because it is followed + * by a space. 34.E2 is a valid decimal token because it is followed by symbol '+' + * which is not a digit or letter or underscore. 
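+   * For char stream "2.3,", "2.3" is a valid decimal token because it is followed by ',',
+   * which is likewise not a digit, letter or underscore.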
+ */ + public boolean isValidDecimal() { + int nextChar = _input.LA(1); + if (nextChar >= 'A' && nextChar <= 'Z' || nextChar >= '0' && nextChar <= '9' || + nextChar == '_') { + return false; + } else { + return true; + } + } +} singleStatement : statement EOF ; -statement - : compactionStatement #compactionCommand - | .*? #passThrough + statement + : compactionStatement #compactionCommand + | CALL multipartIdentifier '(' (callArgument (',' callArgument)*)? ')' #call + | .*? #passThrough ; compactionStatement - : operation = (RUN | SCHEDULE) COMPACTION ON tableIdentifier (AT instantTimestamp = NUMBER)? #compactionOnTable - | operation = (RUN | SCHEDULE) COMPACTION ON path = STRING (AT instantTimestamp = NUMBER)? #compactionOnPath - | SHOW COMPACTION ON tableIdentifier (LIMIT limit = NUMBER)? #showCompactionOnTable - | SHOW COMPACTION ON path = STRING (LIMIT limit = NUMBER)? #showCompactionOnPath + : operation = (RUN | SCHEDULE) COMPACTION ON tableIdentifier (AT instantTimestamp = INTEGER_VALUE)? #compactionOnTable + | operation = (RUN | SCHEDULE) COMPACTION ON path = STRING (AT instantTimestamp = INTEGER_VALUE)? #compactionOnPath + | SHOW COMPACTION ON tableIdentifier (LIMIT limit = INTEGER_VALUE)? #showCompactionOnTable + | SHOW COMPACTION ON path = STRING (LIMIT limit = INTEGER_VALUE)? #showCompactionOnPath ; tableIdentifier : (db=IDENTIFIER '.')? table=IDENTIFIER ; + callArgument + : expression #positionalArgument + | identifier '=>' expression #namedArgument + ; + + expression + : constant + | stringMap + ; + + constant + : number #numericLiteral + | booleanValue #booleanLiteral + | STRING+ #stringLiteral + | identifier STRING #typeConstructor + ; + + stringMap + : MAP '(' constant (',' constant)* ')' + ; + + booleanValue + : TRUE | FALSE + ; + + number + : MINUS? EXPONENT_VALUE #exponentLiteral + | MINUS? DECIMAL_VALUE #decimalLiteral + | MINUS? INTEGER_VALUE #integerLiteral + | MINUS? BIGINT_LITERAL #bigIntLiteral + | MINUS? SMALLINT_LITERAL #smallIntLiteral + | MINUS? TINYINT_LITERAL #tinyIntLiteral + | MINUS? DOUBLE_LITERAL #doubleLiteral + | MINUS? FLOAT_LITERAL #floatLiteral + | MINUS? BIGDECIMAL_LITERAL #bigDecimalLiteral + ; + + multipartIdentifier + : parts+=identifier ('.' parts+=identifier)* + ; + + identifier + : IDENTIFIER #unquotedIdentifier + | quotedIdentifier #quotedIdentifierAlternative + | nonReserved #unquotedIdentifier + ; + + quotedIdentifier + : BACKQUOTED_IDENTIFIER + ; + + nonReserved + : CALL | COMPACTION | RUN | SCHEDULE | ON | SHOW | LIMIT + ; + ALL: 'ALL'; AT: 'AT'; + CALL: 'CALL'; COMPACTION: 'COMPACTION'; RUN: 'RUN'; SCHEDULE: 'SCHEDULE'; ON: 'ON'; SHOW: 'SHOW'; LIMIT: 'LIMIT'; + MAP: 'MAP'; + NULL: 'NULL'; + TRUE: 'TRUE'; + FALSE: 'FALSE'; + INTERVAL: 'INTERVAL'; + TO: 'TO'; + + PLUS: '+'; + MINUS: '-'; + + STRING + : '\'' ( ~('\''|'\\') | ('\\' .) )* '\'' + | '"' ( ~('"'|'\\') | ('\\' .) )* '"' + ; - NUMBER + BIGINT_LITERAL + : DIGIT+ 'L' + ; + + SMALLINT_LITERAL + : DIGIT+ 'S' + ; + + TINYINT_LITERAL + : DIGIT+ 'Y' + ; + + INTEGER_VALUE : DIGIT+ ; + EXPONENT_VALUE + : DIGIT+ EXPONENT + | DECIMAL_DIGITS EXPONENT {isValidDecimal()}? + ; + + DECIMAL_VALUE + : DECIMAL_DIGITS {isValidDecimal()}? + ; + + FLOAT_LITERAL + : DIGIT+ EXPONENT? 'F' + | DECIMAL_DIGITS EXPONENT? 'F' {isValidDecimal()}? + ; + + DOUBLE_LITERAL + : DIGIT+ EXPONENT? 'D' + | DECIMAL_DIGITS EXPONENT? 'D' {isValidDecimal()}? + ; + + BIGDECIMAL_LITERAL + : DIGIT+ EXPONENT? 'BD' + | DECIMAL_DIGITS EXPONENT? 'BD' {isValidDecimal()}? 
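+    // e.g. matches 10BD, 3E2BD and 12.5BD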
+ ; + IDENTIFIER - : (LETTER | DIGIT | '_')+ - ; + : (LETTER | DIGIT | '_')+ + ; -STRING - : '\'' ( ~('\''|'\\') | ('\\' .) )* '\'' - | '"' ( ~('"'|'\\') | ('\\' .) )* '"' + BACKQUOTED_IDENTIFIER + : '`' ( ~'`' | '``' )* '`' ; + fragment DECIMAL_DIGITS + : DIGIT+ '.' DIGIT* + | '.' DIGIT+ + ; + fragment EXPONENT + : 'E' [+-]? DIGIT+ + ; fragment DIGIT - : [0-9] - ; + : [0-9] + ; fragment LETTER - : [A-Z] - ; + : [A-Z] + ; SIMPLE_COMMENT : '--' ~[\r\n]* '\r'? '\n'? -> channel(HIDDEN) diff --git a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/QuickstartUtils.java b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/QuickstartUtils.java index e0929efed1f87..9aa7ac1a664cd 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/QuickstartUtils.java +++ b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/QuickstartUtils.java @@ -19,6 +19,7 @@ package org.apache.hudi; import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; @@ -149,7 +150,7 @@ public Stream generateInsertsStream(String randomString, Integer n existingKeys.put(currSize + i, key); numExistingKeys++; try { - return new HoodieRecord(key, generateRandomValue(key, randomString)); + return new HoodieAvroRecord(key, generateRandomValue(key, randomString)); } catch (IOException e) { throw new HoodieIOException(e.getMessage(), e); } @@ -165,7 +166,7 @@ public List generateInserts(Integer n) throws IOException { } public HoodieRecord generateUpdateRecord(HoodieKey key, String randomString) throws IOException { - return new HoodieRecord(key, generateRandomValue(key, randomString)); + return new HoodieAvroRecord(key, generateRandomValue(key, randomString)); } /** diff --git a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/payload/AWSDmsAvroPayload.java b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/payload/AWSDmsAvroPayload.java index d0e1326761076..0eba1d9a6a4bc 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/payload/AWSDmsAvroPayload.java +++ b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/payload/AWSDmsAvroPayload.java @@ -26,6 +26,7 @@ import org.apache.avro.generic.IndexedRecord; import java.io.IOException; +import java.util.Properties; /** * Provides support for seamlessly applying changes captured via Amazon Database Migration Service onto S3. @@ -68,12 +69,25 @@ private Option handleDeleteOperation(IndexedRecord insertValue) t return delete ? 
Option.empty() : Option.of(insertValue); } + @Override + public Option getInsertValue(Schema schema, Properties properties) throws IOException { + IndexedRecord insertValue = super.getInsertValue(schema, properties).get(); + return handleDeleteOperation(insertValue); + } + @Override public Option getInsertValue(Schema schema) throws IOException { IndexedRecord insertValue = super.getInsertValue(schema).get(); return handleDeleteOperation(insertValue); } + @Override + public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema, Properties properties) + throws IOException { + IndexedRecord insertValue = super.getInsertValue(schema, properties).get(); + return handleDeleteOperation(insertValue); + } + @Override public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException { diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Call.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Call.scala new file mode 100644 index 0000000000000..df2a953752fa8 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Call.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.catalyst.plans.logical + +import org.apache.spark.sql.catalyst.expressions.Expression + +case class CallCommand(name: Seq[String], args: Seq[CallArgument]) extends Command { + override def children: Seq[LogicalPlan] = Seq.empty + + def withNewChildrenInternal(newChildren: IndexedSeq[LogicalPlan]): CallCommand = { + this + } +} + +/** + * An argument in a CALL statement. + */ +sealed trait CallArgument { + def expr: Expression +} + +/** + * An argument in a CALL statement identified by name. + */ +case class NamedArgument(name: String, expr: Expression) extends CallArgument + +/** + * An argument in a CALL statement identified by position. 
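+ *
+ * For example (procedure name hypothetical), `CALL show_commits('h1', 10)` produces two
+ * positional arguments, whereas `CALL show_commits(table => 'h1', limit => 10)` produces
+ * named arguments instead.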
+ */ +case class PositionalArgument(expr: Expression) extends CallArgument diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlUtils.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlUtils.scala index a198d0e009af2..048ca4ec6e758 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlUtils.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlUtils.scala @@ -19,10 +19,8 @@ package org.apache.spark.sql.hudi import org.apache.hudi.SparkAdapterSupport import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.expressions.{And, Cast, Expression, Literal} +import org.apache.spark.sql.catalyst.expressions.{And, Expression} import org.apache.spark.sql.catalyst.plans.logical.{MergeIntoTable, SubqueryAlias} -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{DataType, NullType} object HoodieSqlUtils extends SparkAdapterSupport { @@ -50,12 +48,4 @@ object HoodieSqlUtils extends SparkAdapterSupport { case exp => Seq(exp) } } - - def castIfNeeded(child: Expression, dataType: DataType, conf: SQLConf): Expression = { - child match { - case Literal(nul, NullType) => Literal(nul, dataType) - case _ => if (child.dataType != dataType) - Cast(child, dataType, Option(conf.sessionLocalTimeZone)) else child - } - } } diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodieAnalysis.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodieAnalysis.scala index c8fa32891e0f9..28f8a92e94405 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodieAnalysis.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodieAnalysis.scala @@ -17,24 +17,26 @@ package org.apache.spark.sql.hudi.analysis -import org.apache.hudi.{HoodieSparkUtils, SparkAdapterSupport} import org.apache.hudi.DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL import org.apache.hudi.common.model.HoodieRecord -import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.util.ReflectionUtils +import org.apache.hudi.{HoodieSparkUtils, SparkAdapterSupport} import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedStar} -import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, Expression, Literal, NamedExpression} +import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, Expression, GenericInternalRow, Literal, NamedExpression} import org.apache.spark.sql.catalyst.plans.Inner import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.datasources.{CreateTable, LogicalRelation} -import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.{getTableIdentifier, getTableLocation, isHoodieTable, removeMetaFields, tableExistsInPath} +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.{getTableIdentifier, removeMetaFields} import org.apache.spark.sql.hudi.HoodieSqlUtils._ import org.apache.spark.sql.hudi.command._ -import org.apache.spark.sql.hudi.{HoodieOptionConfig, HoodieSqlCommonUtils, HoodieSqlUtils} +import org.apache.spark.sql.hudi.command.procedures.{HoodieProcedures, Procedure, ProcedureArgs} +import 
org.apache.spark.sql.hudi.{HoodieOptionConfig, HoodieSqlCommonUtils} import org.apache.spark.sql.types.StringType import org.apache.spark.sql.{AnalysisException, SparkSession} +import java.util import scala.collection.JavaConverters._ object HoodieAnalysis { @@ -42,16 +44,44 @@ object HoodieAnalysis { Seq( session => HoodieResolveReferences(session), session => HoodieAnalysis(session) - ) + ) ++ extraResolutionRules() def customPostHocResolutionRules(): Seq[SparkSession => Rule[LogicalPlan]] = Seq( session => HoodiePostAnalysisRule(session) - ) + ) ++ extraPostHocResolutionRules() + + def extraResolutionRules(): Seq[SparkSession => Rule[LogicalPlan]] = { + if (HoodieSparkUtils.gteqSpark3_2) { + val spark3AnalysisClass = "org.apache.spark.sql.hudi.analysis.HoodieSpark3Analysis" + val spark3Analysis: SparkSession => Rule[LogicalPlan] = + session => ReflectionUtils.loadClass(spark3AnalysisClass, session).asInstanceOf[Rule[LogicalPlan]] + + val spark3ResolveReferences = "org.apache.spark.sql.hudi.analysis.HoodieSpark3ResolveReferences" + val spark3References: SparkSession => Rule[LogicalPlan] = + session => ReflectionUtils.loadClass(spark3ResolveReferences, session).asInstanceOf[Rule[LogicalPlan]] + + Seq(spark3Analysis, spark3References) + } else { + Seq.empty + } + } + + def extraPostHocResolutionRules(): Seq[SparkSession => Rule[LogicalPlan]] = + if (HoodieSparkUtils.gteqSpark3_2) { + val spark3PostHocResolutionClass = "org.apache.spark.sql.hudi.analysis.HoodieSpark3PostAnalysisRule" + val spark3PostHocResolution: SparkSession => Rule[LogicalPlan] = + session => ReflectionUtils.loadClass(spark3PostHocResolutionClass, session).asInstanceOf[Rule[LogicalPlan]] + + Seq(spark3PostHocResolution) + } else { + Seq.empty + } } /** * Rule for convert the logical plan to command. 
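 * (e.g. a resolved MergeIntoTable is rewritten to MergeIntoHoodieTableCommand, and a parsed
 * CallCommand to CallProcedureHoodieCommand; see the cases below.)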
+ *
  * @param sparkSession
  */
case class HoodieAnalysis(sparkSession: SparkSession) extends Rule[LogicalPlan]
@@ -61,36 +91,36 @@ case class HoodieAnalysis(sparkSession: SparkSession) extends Rule[LogicalPlan]
     plan match {
       // Convert to MergeIntoHoodieTableCommand
       case m @ MergeIntoTable(target, _, _, _, _)
-        if m.resolved && isHoodieTable(target, sparkSession) =>
+        if m.resolved && sparkAdapter.isHoodieTable(target, sparkSession) =>
         MergeIntoHoodieTableCommand(m)

       // Convert to UpdateHoodieTableCommand
       case u @ UpdateTable(table, _, _)
-        if u.resolved && isHoodieTable(table, sparkSession) =>
+        if u.resolved && sparkAdapter.isHoodieTable(table, sparkSession) =>
         UpdateHoodieTableCommand(u)

       // Convert to DeleteHoodieTableCommand
       case d @ DeleteFromTable(table, _)
-        if d.resolved && isHoodieTable(table, sparkSession) =>
+        if d.resolved && sparkAdapter.isHoodieTable(table, sparkSession) =>
         DeleteHoodieTableCommand(d)

       // Convert to InsertIntoHoodieTableCommand
       case l if sparkAdapter.isInsertInto(l) =>
         val (table, partition, query, overwrite, _) = sparkAdapter.getInsertIntoChildren(l).get
         table match {
-          case relation: LogicalRelation if isHoodieTable(relation, sparkSession) =>
+          case relation: LogicalRelation if sparkAdapter.isHoodieTable(relation, sparkSession) =>
             new InsertIntoHoodieTableCommand(relation, query, partition, overwrite)
           case _ => l
         }

       // Convert to CreateHoodieTableAsSelectCommand
       case CreateTable(table, mode, Some(query))
-        if query.resolved && isHoodieTable(table) =>
+        if query.resolved && sparkAdapter.isHoodieTable(table) =>
         CreateHoodieTableAsSelectCommand(table, mode, query)

       // Convert to CompactionHoodieTableCommand
       case CompactionTable(table, operation, options)
-        if table.resolved && isHoodieTable(table, sparkSession) =>
+        if table.resolved && sparkAdapter.isHoodieTable(table, sparkSession) =>
         val tableId = getTableIdentifier(table)
         val catalogTable = sparkSession.sessionState.catalog.getTableMetadata(tableId)
         CompactionHoodieTableCommand(catalogTable, operation, options)
@@ -99,33 +129,76 @@ case class HoodieAnalysis(sparkSession: SparkSession) extends Rule[LogicalPlan]
         CompactionHoodiePathCommand(path, operation, options)
       // Convert to CompactionShowOnTable
       case CompactionShowOnTable(table, limit)
-        if isHoodieTable(table, sparkSession) =>
+        if sparkAdapter.isHoodieTable(table, sparkSession) =>
         val tableId = getTableIdentifier(table)
         val catalogTable = sparkSession.sessionState.catalog.getTableMetadata(tableId)
         CompactionShowHoodieTableCommand(catalogTable, limit)
       // Convert to CompactionShowHoodiePathCommand
       case CompactionShowOnPath(path, limit) =>
         CompactionShowHoodiePathCommand(path, limit)
-      case _=> plan
+      // Convert to HoodieCallProcedureCommand
+      case c@CallCommand(_, _) =>
+        val procedure: Option[Procedure] = loadProcedure(c.name)
+        val input = buildProcedureArgs(c.args)
+        if (procedure.nonEmpty) {
+          CallProcedureHoodieCommand(procedure.get, input)
+        } else {
+          c
+        }
+      case _ => plan
+    }
+  }
+
+  private def loadProcedure(name: Seq[String]): Option[Procedure] = {
+    val procedure: Option[Procedure] = if (name.nonEmpty) {
+      val builder = HoodieProcedures.newBuilder(name.last)
+      if (builder != null) {
+        Option(builder.build)
+      } else {
+        throw new AnalysisException(s"procedure: ${name.last} does not exist")
+      }
+    } else {
+      None
+    }
+    procedure
+  }
+
+  private def buildProcedureArgs(exprs: Seq[CallArgument]): ProcedureArgs = {
+    val values = new Array[Any](exprs.size)
+    var isNamedArgs: Boolean = false
+    val map = new util.LinkedHashMap[String, Int]()
+    for (index <-
exprs.indices) { + exprs(index) match { + case expr: NamedArgument => + map.put(expr.name, index) + values(index) = expr.expr.eval() + isNamedArgs = true + case _ => + map.put(index.toString, index) + values(index) = exprs(index).expr.eval() + isNamedArgs = false + } + } + ProcedureArgs(isNamedArgs, map, new GenericInternalRow(values)) } } /** * Rule for resolve hoodie's extended syntax or rewrite some logical plan. + * * @param sparkSession */ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[LogicalPlan] with SparkAdapterSupport { private lazy val analyzer = sparkSession.sessionState.analyzer - def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperatorsUp { + def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperatorsUp { // Resolve merge into case mergeInto @ MergeIntoTable(target, source, mergeCondition, matchedActions, notMatchedActions) - if isHoodieTable(target, sparkSession) && target.resolved => - + if sparkAdapter.isHoodieTable(target, sparkSession) && target.resolved => val resolver = sparkSession.sessionState.conf.resolver val resolvedSource = analyzer.execute(source) + def isInsertOrUpdateStar(assignments: Seq[Assignment]): Boolean = { if (assignments.isEmpty) { true @@ -277,7 +350,7 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[Logi // Resolve update table case UpdateTable(table, assignments, condition) - if isHoodieTable(table, sparkSession) && table.resolved => + if sparkAdapter.isHoodieTable(table, sparkSession) && table.resolved => // Resolve condition val resolvedCondition = condition.map(resolveExpressionFrom(table)(_)) // Resolve assignments @@ -291,7 +364,7 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[Logi // Resolve Delete Table case DeleteFromTable(table, condition) - if isHoodieTable(table, sparkSession) && table.resolved => + if sparkAdapter.isHoodieTable(table, sparkSession) && table.resolved => // Resolve condition val resolvedCondition = condition.map(resolveExpressionFrom(table)(_)) // Return the resolved DeleteTable @@ -303,7 +376,7 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[Logi val (table, partition, query, overwrite, ifPartitionNotExists) = sparkAdapter.getInsertIntoChildren(l).get - if (isHoodieTable(table, sparkSession) && query.resolved && + if (sparkAdapter.isHoodieTable(table, sparkSession) && query.resolved && !containUnResolvedStar(query) && !checkAlreadyAppendMetaField(query)) { val metaFields = HoodieRecord.HOODIE_META_COLUMNS.asScala.map( @@ -336,6 +409,7 @@ case class HoodieResolveReferences(sparkSession: SparkSession) extends Rule[Logi /** * Check if the the query of insert statement has already append the meta fields to avoid * duplicate append. 
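 * (The meta fields are Hudi's standard columns such as _hoodie_commit_time and
 * _hoodie_record_key, which the rule above prepends to the query's projection.)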
+ * * @param query * @return */ @@ -401,37 +475,37 @@ case class HoodiePostAnalysisRule(sparkSession: SparkSession) extends Rule[Logic plan match { // Rewrite the CreateDataSourceTableCommand to CreateHoodieTableCommand case CreateDataSourceTableCommand(table, ignoreIfExists) - if isHoodieTable(table) => + if sparkAdapter.isHoodieTable(table) => CreateHoodieTableCommand(table, ignoreIfExists) // Rewrite the DropTableCommand to DropHoodieTableCommand case DropTableCommand(tableName, ifExists, isView, purge) - if isHoodieTable(tableName, sparkSession) => + if sparkAdapter.isHoodieTable(tableName, sparkSession) => DropHoodieTableCommand(tableName, ifExists, isView, purge) // Rewrite the AlterTableDropPartitionCommand to AlterHoodieTableDropPartitionCommand case AlterTableDropPartitionCommand(tableName, specs, ifExists, purge, retainData) - if isHoodieTable(tableName, sparkSession) => + if sparkAdapter.isHoodieTable(tableName, sparkSession) => AlterHoodieTableDropPartitionCommand(tableName, specs, ifExists, purge, retainData) // Rewrite the AlterTableRenameCommand to AlterHoodieTableRenameCommand // Rewrite the AlterTableAddColumnsCommand to AlterHoodieTableAddColumnsCommand case AlterTableAddColumnsCommand(tableId, colsToAdd) - if isHoodieTable(tableId, sparkSession) => + if sparkAdapter.isHoodieTable(tableId, sparkSession) => AlterHoodieTableAddColumnsCommand(tableId, colsToAdd) // Rewrite the AlterTableRenameCommand to AlterHoodieTableRenameCommand case AlterTableRenameCommand(oldName, newName, isView) - if !isView && isHoodieTable(oldName, sparkSession) => + if !isView && sparkAdapter.isHoodieTable(oldName, sparkSession) => new AlterHoodieTableRenameCommand(oldName, newName, isView) // Rewrite the AlterTableChangeColumnCommand to AlterHoodieTableChangeColumnCommand case AlterTableChangeColumnCommand(tableName, columnName, newColumn) - if isHoodieTable(tableName, sparkSession) => + if sparkAdapter.isHoodieTable(tableName, sparkSession) => AlterHoodieTableChangeColumnCommand(tableName, columnName, newColumn) // SPARK-34238: the definition of ShowPartitionsCommand has been changed in Spark3.2. // Match the class type instead of call the `unapply` method. case s: ShowPartitionsCommand - if isHoodieTable(s.tableName, sparkSession) => + if sparkAdapter.isHoodieTable(s.tableName, sparkSession) => ShowHoodieTablePartitionsCommand(s.tableName, s.spec) // Rewrite TruncateTableCommand to TruncateHoodieTableCommand case TruncateTableCommand(tableName, partitionSpec) - if isHoodieTable(tableName, sparkSession) => + if sparkAdapter.isHoodieTable(tableName, sparkSession) => new TruncateHoodieTableCommand(tableName, partitionSpec) case _ => plan } diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CallProcedureHoodieCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CallProcedureHoodieCommand.scala new file mode 100644 index 0000000000000..f63f4115e9195 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CallProcedureHoodieCommand.scala @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command + +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.hudi.command.procedures.{Procedure, ProcedureArgs} +import org.apache.spark.sql.{Row, SparkSession} + +import scala.collection.Seq + +case class CallProcedureHoodieCommand( + procedure: Procedure, + args: ProcedureArgs) extends HoodieLeafRunnableCommand { + + override def output: Seq[Attribute] = procedure.outputType.toAttributes + + override def run(sparkSession: SparkSession): Seq[Row] = { + procedure.call(args) + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CompactionHoodiePathCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CompactionHoodiePathCommand.scala index 1363fb939b4e3..2f5c4d004f58f 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CompactionHoodiePathCommand.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CompactionHoodiePathCommand.scala @@ -17,14 +17,13 @@ package org.apache.spark.sql.hudi.command -import org.apache.hudi.client.WriteStatus -import org.apache.hudi.common.model.HoodieTableType +import org.apache.hudi.common.model.{HoodieCommitMetadata, HoodieTableType} import org.apache.hudi.common.table.timeline.{HoodieActiveTimeline, HoodieTimeline} import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} import org.apache.hudi.common.util.{HoodieTimer, Option => HOption} import org.apache.hudi.exception.HoodieException import org.apache.hudi.{DataSourceUtils, DataSourceWriteOptions, HoodieWriterUtils} -import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} +import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical.CompactionOperation import org.apache.spark.sql.catalyst.plans.logical.CompactionOperation.{CompactionOperation, RUN, SCHEDULE} @@ -100,8 +99,8 @@ case class CompactionHoodiePathCommand(path: String, timer.startTimer() willCompactionInstants.foreach {compactionInstant => val writeResponse = client.compact(compactionInstant) - handlerResponse(writeResponse) - client.commitCompaction(compactionInstant, writeResponse, HOption.empty()) + handleResponse(writeResponse.getCommitMetadata.get()) + client.commitCompaction(compactionInstant, writeResponse.getCommitMetadata.get(), HOption.empty()) } logInfo(s"Finish Run compaction at instants: [${willCompactionInstants.mkString(",")}]," + s" spend: ${timer.endTimer()}ms") @@ -111,17 +110,13 @@ case class CompactionHoodiePathCommand(path: String, } } - private def handlerResponse(writeResponse: JavaRDD[WriteStatus]): Unit = { + private def handleResponse(metadata: HoodieCommitMetadata): Unit = { + // Handle error - val error = writeResponse.rdd.filter(f => f.hasErrors).take(1).headOption - if (error.isDefined) { - if (error.get.hasGlobalError) { - throw error.get.getGlobalError - } else if (!error.get.getErrors.isEmpty) { - val key 
= error.get.getErrors.asScala.head._1
-        val exception = error.get.getErrors.asScala.head._2
-        throw new HoodieException(s"Error in write record: $key", exception)
-      }
+    val writeStats = metadata.getPartitionToWriteStats.entrySet().flatMap(e => e.getValue).toList
+    val errorsCount = writeStats.map(state => state.getTotalWriteErrors).sum
+    if (errorsCount > 0) {
+      throw new HoodieException(s"Found $errorsCount write errors when writing records")
     }
   }
diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CreateHoodieTableAsSelectCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CreateHoodieTableAsSelectCommand.scala
index 572013981d698..2877dd8d9ee94 100644
--- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CreateHoodieTableAsSelectCommand.scala
+++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/CreateHoodieTableAsSelectCommand.scala
@@ -19,17 +19,14 @@ package org.apache.spark.sql.hudi.command

 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
-
 import org.apache.hudi.DataSourceWriteOptions
 import org.apache.hudi.hive.util.ConfigUtils
 import org.apache.hudi.sql.InsertMode
-
-import org.apache.spark.sql.{Row, SaveMode, SparkSession}
 import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType, HoodieCatalogTable}
+import org.apache.spark.sql.catalyst.plans.QueryPlan
 import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project}
-import org.apache.spark.sql.execution.SparkPlan
-import org.apache.spark.sql.execution.command.DataWritingCommand
 import org.apache.spark.sql.hudi.HoodieSqlCommonUtils
+import org.apache.spark.sql.{Row, SaveMode, SparkSession}

 import scala.collection.JavaConverters._

@@ -40,6 +37,7 @@ case class CreateHoodieTableAsSelectCommand(
     table: CatalogTable,
     mode: SaveMode,
     query: LogicalPlan) extends HoodieLeafRunnableCommand {
+  override def innerChildren: Seq[QueryPlan[_]] = Seq(query)

   override def run(sparkSession: SparkSession): Seq[Row] = {
     assert(table.tableType != CatalogTableType.VIEW)
diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/InsertIntoHoodieTableCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/InsertIntoHoodieTableCommand.scala
index 030d3e3c623ca..f058b47d782d5 100644
--- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/InsertIntoHoodieTableCommand.scala
+++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/InsertIntoHoodieTableCommand.scala
@@ -17,27 +17,18 @@ package org.apache.spark.sql.hudi.command

-import org.apache.hudi.DataSourceWriteOptions._
-import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload
-import org.apache.hudi.config.HoodieWriteConfig
-import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME
-import org.apache.hudi.hive.MultiPartKeysValueExtractor
-import org.apache.hudi.hive.ddl.HiveSyncMode
-import org.apache.hudi.keygen.ComplexKeyGenerator
-import org.apache.hudi.sql.InsertMode
-import org.apache.hudi.{DataSourceWriteOptions, HoodieSparkSqlWriter}
+import org.apache.hudi.HoodieSparkSqlWriter
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.catalog.{CatalogTable, HoodieCatalogTable}
 import org.apache.spark.sql.catalyst.expressions.{Alias, Literal}
+import org.apache.spark.sql.catalyst.plans.QueryPlan
 import
org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.hudi.HoodieSqlCommonUtils._ -import org.apache.spark.sql.hudi.HoodieSqlUtils.castIfNeeded +import org.apache.spark.sql.hudi.ProvidesHoodieConfig import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession} -import scala.collection.JavaConverters._ - /** * Command for insert into hoodie table. */ @@ -47,6 +38,7 @@ case class InsertIntoHoodieTableCommand( partition: Map[String, Option[String]], overwrite: Boolean) extends HoodieLeafRunnableCommand { + override def innerChildren: Seq[QueryPlan[_]] = Seq(query) override def run(sparkSession: SparkSession): Seq[Row] = { assert(logicalRelation.catalogTable.isDefined, "Missing catalog table") @@ -57,7 +49,7 @@ case class InsertIntoHoodieTableCommand( } } -object InsertIntoHoodieTableCommand extends Logging { +object InsertIntoHoodieTableCommand extends Logging with ProvidesHoodieConfig { /** * Run the insert query. We support both dynamic partition insert and static partition insert. * @param sparkSession The spark session. @@ -174,112 +166,4 @@ object InsertIntoHoodieTableCommand extends Logging { val alignedProjects = dataProjectsWithoutMetaFields ++ partitionProjects Project(alignedProjects, query) } - - /** - * Build the default config for insert. - * @return - */ - private def buildHoodieInsertConfig( - hoodieCatalogTable: HoodieCatalogTable, - sparkSession: SparkSession, - isOverwrite: Boolean, - insertPartitions: Map[String, Option[String]] = Map.empty, - extraOptions: Map[String, String]): Map[String, String] = { - - if (insertPartitions.nonEmpty && - (insertPartitions.keys.toSet != hoodieCatalogTable.partitionFields.toSet)) { - throw new IllegalArgumentException(s"Insert partition fields" + - s"[${insertPartitions.keys.mkString(" " )}]" + - s" not equal to the defined partition in table[${hoodieCatalogTable.partitionFields.mkString(",")}]") - } - val path = hoodieCatalogTable.tableLocation - val tableType = hoodieCatalogTable.tableTypeName - val tableConfig = hoodieCatalogTable.tableConfig - val tableSchema = hoodieCatalogTable.tableSchema - - val options = hoodieCatalogTable.catalogProperties ++ tableConfig.getProps.asScala.toMap ++ extraOptions - val parameters = withSparkConf(sparkSession, options)() - - val preCombineColumn = hoodieCatalogTable.preCombineKey.getOrElse("") - val partitionFields = hoodieCatalogTable.partitionFields.mkString(",") - - val hiveStylePartitioningEnable = Option(tableConfig.getHiveStylePartitioningEnable).getOrElse("true") - val urlEncodePartitioning = Option(tableConfig.getUrlEncodePartitioning).getOrElse("false") - val keyGeneratorClassName = Option(tableConfig.getKeyGeneratorClassName) - .getOrElse(classOf[ComplexKeyGenerator].getCanonicalName) - - val enableBulkInsert = parameters.getOrElse(DataSourceWriteOptions.SQL_ENABLE_BULK_INSERT.key, - DataSourceWriteOptions.SQL_ENABLE_BULK_INSERT.defaultValue()).toBoolean - val dropDuplicate = sparkSession.conf - .getOption(INSERT_DROP_DUPS.key).getOrElse(INSERT_DROP_DUPS.defaultValue).toBoolean - - val insertMode = InsertMode.of(parameters.getOrElse(DataSourceWriteOptions.SQL_INSERT_MODE.key, - DataSourceWriteOptions.SQL_INSERT_MODE.defaultValue())) - val isNonStrictMode = insertMode == InsertMode.NON_STRICT - val isPartitionedTable = hoodieCatalogTable.partitionFields.nonEmpty - val hasPreCombineColumn = preCombineColumn.nonEmpty - val operation = - 
(enableBulkInsert, isOverwrite, dropDuplicate, isNonStrictMode, isPartitionedTable) match { - case (true, _, _, false, _) => - throw new IllegalArgumentException(s"Table with primaryKey can not use bulk insert in ${insertMode.value()} mode.") - case (true, true, _, _, true) => - throw new IllegalArgumentException(s"Insert Overwrite Partition can not use bulk insert.") - case (true, _, true, _, _) => - throw new IllegalArgumentException(s"Bulk insert cannot support drop duplication." + - s" Please disable $INSERT_DROP_DUPS and try again.") - // if enableBulkInsert is true, use bulk insert for the insert overwrite non-partitioned table. - case (true, true, _, _, false) => BULK_INSERT_OPERATION_OPT_VAL - // insert overwrite table - case (false, true, _, _, false) => INSERT_OVERWRITE_TABLE_OPERATION_OPT_VAL - // insert overwrite partition - case (_, true, _, _, true) => INSERT_OVERWRITE_OPERATION_OPT_VAL - // disable dropDuplicate, and provide preCombineKey, use the upsert operation for strict and upsert mode. - case (false, false, false, false, _) if hasPreCombineColumn => UPSERT_OPERATION_OPT_VAL - // if table is pk table and has enableBulkInsert use bulk insert for non-strict mode. - case (true, _, _, true, _) => BULK_INSERT_OPERATION_OPT_VAL - // for the rest case, use the insert operation - case _ => INSERT_OPERATION_OPT_VAL - } - - val payloadClassName = if (operation == UPSERT_OPERATION_OPT_VAL && - tableType == COW_TABLE_TYPE_OPT_VAL && insertMode == InsertMode.STRICT) { - // Only validate duplicate key for COW, for MOR it will do the merge with the DefaultHoodieRecordPayload - // on reading. - classOf[ValidateDuplicateKeyPayload].getCanonicalName - } else { - classOf[OverwriteWithLatestAvroPayload].getCanonicalName - } - logInfo(s"insert statement use write operation type: $operation, payloadClass: $payloadClassName") - - val enableHive = isEnableHive(sparkSession) - withSparkConf(sparkSession, options) { - Map( - "path" -> path, - TABLE_TYPE.key -> tableType, - TBL_NAME.key -> hoodieCatalogTable.tableName, - PRECOMBINE_FIELD.key -> preCombineColumn, - OPERATION.key -> operation, - HIVE_STYLE_PARTITIONING.key -> hiveStylePartitioningEnable, - URL_ENCODE_PARTITIONING.key -> urlEncodePartitioning, - KEYGENERATOR_CLASS_NAME.key -> classOf[SqlKeyGenerator].getCanonicalName, - SqlKeyGenerator.ORIGIN_KEYGEN_CLASS_NAME -> keyGeneratorClassName, - RECORDKEY_FIELD.key -> hoodieCatalogTable.primaryKeys.mkString(","), - PARTITIONPATH_FIELD.key -> partitionFields, - PAYLOAD_CLASS_NAME.key -> payloadClassName, - ENABLE_ROW_WRITER.key -> enableBulkInsert.toString, - HoodieWriteConfig.COMBINE_BEFORE_INSERT.key -> String.valueOf(hasPreCombineColumn), - META_SYNC_ENABLED.key -> enableHive.toString, - HIVE_SYNC_MODE.key -> HiveSyncMode.HMS.name(), - HIVE_USE_JDBC.key -> "false", - HIVE_DATABASE.key -> hoodieCatalogTable.table.identifier.database.getOrElse("default"), - HIVE_TABLE.key -> hoodieCatalogTable.table.identifier.table, - HIVE_SUPPORT_TIMESTAMP_TYPE.key -> "true", - HIVE_PARTITION_FIELDS.key -> partitionFields, - HIVE_PARTITION_EXTRACTOR_CLASS.key -> classOf[MultiPartKeysValueExtractor].getCanonicalName, - HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key -> "200", - HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key -> "200", - SqlKeyGenerator.PARTITION_SCHEMA -> hoodieCatalogTable.partitionSchema.toDDL - ) - } - } } diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/MergeIntoHoodieTableCommand.scala 
b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/MergeIntoHoodieTableCommand.scala index 2c76ad567f58b..1d9aedd2af6fa 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/MergeIntoHoodieTableCommand.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/MergeIntoHoodieTableCommand.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, BoundReference, Cast, EqualTo, Expression, Literal} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.hudi.HoodieSqlCommonUtils._ -import org.apache.spark.sql.hudi.HoodieSqlUtils.{castIfNeeded, getMergeIntoTargetTableId} +import org.apache.spark.sql.hudi.HoodieSqlUtils.getMergeIntoTargetTableId import org.apache.spark.sql.hudi.SerDeUtils import org.apache.spark.sql.hudi.command.payload.ExpressionPayload import org.apache.spark.sql.hudi.command.payload.ExpressionPayload._ @@ -443,13 +443,18 @@ case class MergeIntoHoodieTableCommand(mergeInto: MergeIntoTable) extends Hoodie val partitionColumns = tableConfig.getPartitionFieldProp.split(",").map(_.toLowerCase) val partitionSchema = StructType(tableSchema.filter(f => partitionColumns.contains(f.name))) + // NOTE: Here we fallback to "" to make sure that null value is not overridden with + // default value ("ts") + // TODO(HUDI-3456) clean up + val preCombineField = hoodieCatalogTable.preCombineKey.getOrElse("") + // Enable the hive sync by default if spark have enable the hive metastore. val enableHive = isEnableHive(sparkSession) withSparkConf(sparkSession, hoodieCatalogTable.catalogProperties) { Map( "path" -> path, RECORDKEY_FIELD.key -> tableConfig.getRecordKeyFieldProp, - PRECOMBINE_FIELD.key -> hoodieCatalogTable.preCombineKey.getOrElse(""), + PRECOMBINE_FIELD.key -> preCombineField, TBL_NAME.key -> hoodieCatalogTable.tableName, PARTITIONPATH_FIELD.key -> tableConfig.getPartitionFieldProp, PAYLOAD_CLASS_NAME.key -> classOf[ExpressionPayload].getCanonicalName, @@ -470,6 +475,7 @@ case class MergeIntoHoodieTableCommand(mergeInto: MergeIntoTable) extends Hoodie HoodieWriteConfig.DELETE_PARALLELISM_VALUE.key -> "200", SqlKeyGenerator.PARTITION_SCHEMA -> partitionSchema.toDDL ) + .filter { case (_, v) => v != null } } } } diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/UpdateHoodieTableCommand.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/UpdateHoodieTableCommand.scala index 512e9a18bd560..277f2643423dd 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/UpdateHoodieTableCommand.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/UpdateHoodieTableCommand.scala @@ -17,26 +17,21 @@ package org.apache.spark.sql.hudi.command -import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.SparkAdapterSupport import org.apache.hudi.common.model.HoodieRecord -import org.apache.hudi.config.HoodieWriteConfig -import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME -import org.apache.hudi.hive.MultiPartKeysValueExtractor -import org.apache.hudi.hive.ddl.HiveSyncMode import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, Expression} import 
org.apache.spark.sql.catalyst.plans.logical.{Assignment, UpdateTable} import org.apache.spark.sql.hudi.HoodieSqlCommonUtils._ -import org.apache.spark.sql.hudi.HoodieSqlUtils.castIfNeeded +import org.apache.spark.sql.hudi.ProvidesHoodieConfig import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructField import scala.collection.JavaConverters._ case class UpdateHoodieTableCommand(updateTable: UpdateTable) extends HoodieLeafRunnableCommand - with SparkAdapterSupport { + with SparkAdapterSupport with ProvidesHoodieConfig { private val table = updateTable.table private val tableId = getTableIdentifier(table) @@ -72,7 +67,7 @@ case class UpdateHoodieTableCommand(updateTable: UpdateTable) extends HoodieLeaf df = df.filter(Column(updateTable.condition.get)) } df = df.select(projects: _*) - val config = buildHoodieConfig(sparkSession) + val config = buildHoodieConfig(HoodieCatalogTable(sparkSession, tableId)) df.write .format("hudi") .mode(SaveMode.Append) @@ -83,42 +78,6 @@ case class UpdateHoodieTableCommand(updateTable: UpdateTable) extends HoodieLeaf Seq.empty[Row] } - private def buildHoodieConfig(sparkSession: SparkSession): Map[String, String] = { - val hoodieCatalogTable = HoodieCatalogTable(sparkSession, tableId) - val catalogProperties = hoodieCatalogTable.catalogProperties - val tableConfig = hoodieCatalogTable.tableConfig - - val preCombineColumn = Option(tableConfig.getPreCombineField).getOrElse("") - assert(hoodieCatalogTable.primaryKeys.nonEmpty, - s"There are no primary key in table $tableId, cannot execute update operator") - val enableHive = isEnableHive(sparkSession) - - withSparkConf(sparkSession, catalogProperties) { - Map( - "path" -> hoodieCatalogTable.tableLocation, - RECORDKEY_FIELD.key -> hoodieCatalogTable.primaryKeys.mkString(","), - PRECOMBINE_FIELD.key -> preCombineColumn, - TBL_NAME.key -> hoodieCatalogTable.tableName, - HIVE_STYLE_PARTITIONING.key -> tableConfig.getHiveStylePartitioningEnable, - URL_ENCODE_PARTITIONING.key -> tableConfig.getUrlEncodePartitioning, - KEYGENERATOR_CLASS_NAME.key -> classOf[SqlKeyGenerator].getCanonicalName, - SqlKeyGenerator.ORIGIN_KEYGEN_CLASS_NAME -> tableConfig.getKeyGeneratorClassName, - OPERATION.key -> UPSERT_OPERATION_OPT_VAL, - PARTITIONPATH_FIELD.key -> tableConfig.getPartitionFieldProp, - META_SYNC_ENABLED.key -> enableHive.toString, - HIVE_SYNC_MODE.key -> HiveSyncMode.HMS.name(), - HIVE_USE_JDBC.key -> "false", - HIVE_DATABASE.key -> tableId.database.getOrElse("default"), - HIVE_TABLE.key -> tableId.table, - HIVE_PARTITION_FIELDS.key -> tableConfig.getPartitionFieldProp, - HIVE_PARTITION_EXTRACTOR_CLASS.key -> classOf[MultiPartKeysValueExtractor].getCanonicalName, - HIVE_SUPPORT_TIMESTAMP_TYPE.key -> "true", - HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key -> "200", - SqlKeyGenerator.PARTITION_SCHEMA -> hoodieCatalogTable.partitionSchema.toDDL - ) - } - } - def cast(exp:Expression, field: StructField, sqlConf: SQLConf): Expression = { castIfNeeded(exp, field.dataType, sqlConf) } diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/BaseProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/BaseProcedure.scala new file mode 100644 index 0000000000000..e64df997da2ff --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/BaseProcedure.scala @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.client.SparkRDDWriteClient +import org.apache.hudi.client.common.HoodieSparkEngineContext +import org.apache.hudi.common.model.HoodieRecordPayload +import org.apache.hudi.config.{HoodieIndexConfig, HoodieWriteConfig} +import org.apache.hudi.index.HoodieIndex.IndexType +import org.apache.spark.api.java.JavaSparkContext +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.types._ + +abstract class BaseProcedure extends Procedure { + val INVALID_ARG_INDEX: Int = -1 + + val spark: SparkSession = SparkSession.active + val jsc = new JavaSparkContext(spark.sparkContext) + + protected def sparkSession: SparkSession = spark + + protected def createHoodieClient(jsc: JavaSparkContext, basePath: String): SparkRDDWriteClient[_ <: HoodieRecordPayload[_ <: AnyRef]] = { + val config = getWriteConfig(basePath) + new SparkRDDWriteClient(new HoodieSparkEngineContext(jsc), config) + } + + protected def getWriteConfig(basePath: String): HoodieWriteConfig = { + HoodieWriteConfig.newBuilder + .withPath(basePath) + .withIndexConfig(HoodieIndexConfig.newBuilder.withIndexType(IndexType.BLOOM).build) + .withRollbackUsingMarkers(false) + .build + } + + protected def checkArgs(target: Array[ProcedureParameter], args: ProcedureArgs): Unit = { + val internalRow = args.internalRow + for (i <- target.indices) { + if (target(i).required) { + var argsIndex: Integer = null + if (args.isNamedArgs) { + argsIndex = getArgsIndex(target(i).name, args) + } else { + argsIndex = getArgsIndex(i.toString, args) + } + assert(-1 != argsIndex && internalRow.get(argsIndex, target(i).dataType) != null, + s"Argument: ${target(i).name} is required") + } + } + } + + protected def getArgsIndex(key: String, args: ProcedureArgs): Integer = { + args.map.getOrDefault(key, INVALID_ARG_INDEX) + } + + protected def getArgValueOrDefault(args: ProcedureArgs, parameter: ProcedureParameter): Any = { + var argsIndex: Int = INVALID_ARG_INDEX + if (args.isNamedArgs) { + argsIndex = getArgsIndex(parameter.name, args) + } else { + argsIndex = getArgsIndex(parameter.index.toString, args) + } + if (argsIndex.equals(INVALID_ARG_INDEX)) parameter.default else getInternalRowValue(args.internalRow, argsIndex, parameter.dataType) + } + + protected def getInternalRowValue(row: InternalRow, index: Int, dataType: DataType): Any = { + dataType match { + case StringType => row.getString(index) + case BinaryType => row.getBinary(index) + case BooleanType => row.getBoolean(index) + case CalendarIntervalType => row.getInterval(index) + case DoubleType => row.getDouble(index) + case d: DecimalType => row.getDecimal(index, d.precision, d.scale) + case FloatType => row.getFloat(index) + case ByteType => 
row.getByte(index) + case IntegerType => row.getInt(index) + case LongType => row.getLong(index) + case ShortType => row.getShort(index) + case NullType => null + case _ => + throw new UnsupportedOperationException(s"type: ${dataType.typeName} not supported") + } + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala new file mode 100644 index 0000000000000..7b919fcef08b5 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import com.google.common.collect.ImmutableMap + +import java.util +import java.util.Locale +import java.util.function.Supplier + +object HoodieProcedures { + private val BUILDERS: util.Map[String, Supplier[ProcedureBuilder]] = initProcedureBuilders + + def newBuilder(name: String): ProcedureBuilder = { + val builderSupplier: Supplier[ProcedureBuilder] = BUILDERS.get(name.toLowerCase(Locale.ROOT)) + if (builderSupplier != null) builderSupplier.get else null + } + + private def initProcedureBuilders: util.Map[String, Supplier[ProcedureBuilder]] = { + val mapBuilder: ImmutableMap.Builder[String, Supplier[ProcedureBuilder]] = ImmutableMap.builder() + mapBuilder.put(ShowCommitsProcedure.NAME, ShowCommitsProcedure.builder) + mapBuilder.put(ShowCommitsMetadataProcedure.NAME, ShowCommitsMetadataProcedure.builder) + mapBuilder.put(RollbackToInstantTimeProcedure.NAME, RollbackToInstantTimeProcedure.builder) + mapBuilder.build + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/Procedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/Procedure.scala new file mode 100644 index 0000000000000..f34e306159827 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/Procedure.scala @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.types.StructType + +import java.util +import scala.collection.mutable + +/** + * An interface representing a stored procedure available for execution. + */ +trait Procedure { + /** + * Returns the input parameters of this procedure. + */ + def parameters: Array[ProcedureParameter] + + /** + * Returns the type of rows produced by this procedure. + */ + def outputType: StructType + + /** + * Executes this procedure. + *

+ * Spark will align the provided arguments according to the input parameters + * defined in {@link #parameters()} either by position or by name before execution. + *

+ * Implementations may provide a summary of execution by returning one or many rows + * as a result. The schema of output rows must match the defined output type + * in {@link #outputType()}. + * + * @param args input arguments + * @return the result of executing this procedure with the given arguments + */ + def call(args: ProcedureArgs): Seq[Row] + + /** + * Returns the description of this procedure. + */ + def description: String = this.getClass.toString +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ProcedureArgs.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ProcedureArgs.scala new file mode 100644 index 0000000000000..5c462c1b892a0 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ProcedureArgs.scala @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.spark.sql.catalyst.InternalRow + +import java.util + +case class ProcedureArgs(isNamedArgs: Boolean, + map: util.LinkedHashMap[String, Int], + internalRow: InternalRow) { +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ProcedureBuilder.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ProcedureBuilder.scala new file mode 100644 index 0000000000000..b2ecd0a3089c4 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ProcedureBuilder.scala @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.spark.sql.hudi.command.procedures + +trait ProcedureBuilder { + def build: Procedure +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ProcedureParameter.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ProcedureParameter.scala new file mode 100644 index 0000000000000..a9ad252bd7a05 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ProcedureParameter.scala @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.spark.sql.types.DataType + +/** + * An input parameter of a {@link Procedure stored procedure}. + */ +abstract class ProcedureParameter { + def index: Int + + /** + * Returns the name of this parameter. + */ + def name: String + + /** + * Returns the type of this parameter. + */ + def dataType: DataType + + /** + * Returns true if this parameter is required. + */ + def required: Boolean + + /** + * Returns this parameter's default value. + */ + def default: Any +} + +object ProcedureParameter { + /** + * Creates a required input parameter. + * + * @param index the positional index of the parameter + * @param name the name of the parameter + * @param dataType the type of the parameter + * @param default the default value of the parameter + * @return the constructed stored procedure parameter + */ + def required(index: Int, name: String, dataType: DataType, default: Any): ProcedureParameterImpl = { + ProcedureParameterImpl(index, name, dataType, default, required = true) + } + + /** + * Creates an optional input parameter. + * + * @param index the positional index of the parameter + * @param name the name of the parameter + * @param dataType the type of the parameter + * @param default the default value of the parameter + * @return the constructed optional stored procedure parameter + */ + def optional(index: Int, name: String, dataType: DataType, default: Any): ProcedureParameterImpl = { + ProcedureParameterImpl(index, name, dataType, default, required = false) + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ProcedureParameterImpl.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ProcedureParameterImpl.scala new file mode 100644 index 0000000000000..a7f4117047457 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ProcedureParameterImpl.scala @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.spark.sql.types.DataType + +import java.util.Objects + +case class ProcedureParameterImpl(index: Int, name: String, dataType: DataType, default: Any, required: Boolean) + extends ProcedureParameter { + + override def equals(other: Any): Boolean = { + // Check reference equality with eq (== would re-enter this overridden equals), + // and only cast once the runtime class has been verified. + if (this.eq(other.asInstanceOf[AnyRef])) { + true + } else if (other == null || (getClass ne other.getClass)) { + false + } else { + val that = other.asInstanceOf[ProcedureParameterImpl] + index == that.index && required == that.required && default == that.default && Objects.equals(name, that.name) && Objects.equals(dataType, that.dataType) + } + } + + override def hashCode: Int = Seq(index, name, dataType, required, default).hashCode() + + override def toString: String = s"ProcedureParameter(index='$index', name='$name', type=$dataType, required=$required, default=$default)" +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RollbackToInstantTimeProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RollbackToInstantTimeProcedure.scala new file mode 100644 index 0000000000000..5414e8db6b37d --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RollbackToInstantTimeProcedure.scala @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.timeline.HoodieTimeline +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion +import org.apache.hudi.common.util.Option +import org.apache.hudi.exception.HoodieException +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util.function.Supplier + +class RollbackToInstantTimeProcedure extends BaseProcedure with ProcedureBuilder { + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.required(1, "instant_time", DataTypes.StringType, None)) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("rollback_result", DataTypes.BooleanType, nullable = true, Metadata.empty)) + ) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val table = getArgValueOrDefault(args, PARAMETERS(0)).asInstanceOf[String] + val instantTime = getArgValueOrDefault(args, PARAMETERS(1)).asInstanceOf[String] + + val hoodieCatalogTable = HoodieCatalogTable(sparkSession, new TableIdentifier(table)) + val basePath = hoodieCatalogTable.tableLocation + val client = createHoodieClient(jsc, basePath) + val config = getWriteConfig(basePath) + val metaClient = HoodieTableMetaClient.builder + .setConf(jsc.hadoopConfiguration) + .setBasePath(config.getBasePath) + .setLoadActiveTimelineOnLoad(false) + .setConsistencyGuardConfig(config.getConsistencyGuardConfig) + .setLayoutVersion(Option.of(new TimelineLayoutVersion(config.getTimelineLayoutVersion))) + .build + + val activeTimeline = metaClient.getActiveTimeline + val completedTimeline: HoodieTimeline = activeTimeline.getCommitsTimeline.filterCompletedInstants + val filteredTimeline = completedTimeline.containsInstant(instantTime) + if (!filteredTimeline) { + throw new HoodieException(s"Commit $instantTime not found in Commits $completedTimeline") + } + + val result = if (client.rollback(instantTime)) true else false + val outputRow = Row(result) + + Seq(outputRow) + } + + override def build: Procedure = new RollbackToInstantTimeProcedure() +} + +object RollbackToInstantTimeProcedure { + val NAME: String = "rollback_to_instant" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get(): RollbackToInstantTimeProcedure = new RollbackToInstantTimeProcedure() + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowCommitsProcedure.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowCommitsProcedure.scala new file mode 100644 index 0000000000000..da089baba9cb6 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ShowCommitsProcedure.scala @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.command.procedures + +import org.apache.hudi.common.model.HoodieCommitMetadata +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.common.table.timeline.{HoodieDefaultTimeline, HoodieInstant} +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.HoodieCatalogTable +import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} + +import java.util +import java.util.Collections +import java.util.function.Supplier +import scala.collection.JavaConverters._ + +class ShowCommitsProcedure(includeExtraMetadata: Boolean) extends BaseProcedure with ProcedureBuilder { + var sortByFieldParameter: ProcedureParameter = _ + + private val PARAMETERS = Array[ProcedureParameter]( + ProcedureParameter.required(0, "table", DataTypes.StringType, None), + ProcedureParameter.optional(1, "limit", DataTypes.IntegerType, 10) + ) + + private val OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("commit_time", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("total_bytes_written", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_files_added", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_files_updated", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_partitions_written", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_records_written", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_update_records_written", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_errors", DataTypes.LongType, nullable = true, Metadata.empty) + )) + + private val METADATA_OUTPUT_TYPE = new StructType(Array[StructField]( + StructField("commit_time", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("action", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("partition", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("file_id", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("previous_commit", DataTypes.StringType, nullable = true, Metadata.empty), + StructField("num_writes", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("num_inserts", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("num_deletes", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("num_update_writes", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_errors", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_log_blocks", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_corrupt_logblocks", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_rollback_blocks", DataTypes.LongType, nullable = true, 
Metadata.empty), + StructField("total_log_records", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_updated_records_compacted", DataTypes.LongType, nullable = true, Metadata.empty), + StructField("total_bytes_written", DataTypes.LongType, nullable = true, Metadata.empty) + )) + + def parameters: Array[ProcedureParameter] = PARAMETERS + + def outputType: StructType = if (includeExtraMetadata) METADATA_OUTPUT_TYPE else OUTPUT_TYPE + + override def call(args: ProcedureArgs): Seq[Row] = { + super.checkArgs(PARAMETERS, args) + + val table = getArgValueOrDefault(args, PARAMETERS(0)).asInstanceOf[String] + val limit = getArgValueOrDefault(args, PARAMETERS(1)).asInstanceOf[Int] + + val hoodieCatalogTable = HoodieCatalogTable(sparkSession, new TableIdentifier(table)) + val basePath = hoodieCatalogTable.tableLocation + val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build + + val activeTimeline = metaClient.getActiveTimeline + if (includeExtraMetadata) { + getCommitsWithMetadata(activeTimeline, limit) + } else { + getCommits(activeTimeline, limit) + } + } + + override def build: Procedure = new ShowCommitsProcedure(includeExtraMetadata) + + private def getCommitsWithMetadata(timeline: HoodieDefaultTimeline, + limit: Int): Seq[Row] = { + import scala.collection.JavaConversions._ + + val (rows: util.ArrayList[Row], newCommits: util.ArrayList[HoodieInstant]) = getSortCommits(timeline) + + for (i <- 0 until newCommits.size) { + val commit = newCommits.get(i) + val commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get, classOf[HoodieCommitMetadata]) + for (partitionWriteStat <- commitMetadata.getPartitionToWriteStats.entrySet) { + for (hoodieWriteStat <- partitionWriteStat.getValue) { + rows.add(Row( + commit.getTimestamp, commit.getAction, hoodieWriteStat.getPartitionPath, + hoodieWriteStat.getFileId, hoodieWriteStat.getPrevCommit, hoodieWriteStat.getNumWrites, + hoodieWriteStat.getNumInserts, hoodieWriteStat.getNumDeletes, hoodieWriteStat.getNumUpdateWrites, + hoodieWriteStat.getTotalWriteErrors, hoodieWriteStat.getTotalLogBlocks, hoodieWriteStat.getTotalCorruptLogBlock, + hoodieWriteStat.getTotalRollbackBlocks, hoodieWriteStat.getTotalLogRecords, + hoodieWriteStat.getTotalUpdatedRecordsCompacted, hoodieWriteStat.getTotalWriteBytes)) + } + } + } + + rows.stream().limit(limit).toArray().map(r => r.asInstanceOf[Row]).toList + } + + private def getSortCommits(timeline: HoodieDefaultTimeline): (util.ArrayList[Row], util.ArrayList[HoodieInstant]) = { + val rows = new util.ArrayList[Row] + // timeline can be read from multiple files. 
So sort is needed instead of reversing the collection + val commits: util.List[HoodieInstant] = timeline.getCommitsTimeline.filterCompletedInstants + .getInstants.toArray().map(instant => instant.asInstanceOf[HoodieInstant]).toList.asJava + val newCommits = new util.ArrayList[HoodieInstant](commits) + Collections.sort(newCommits, HoodieInstant.COMPARATOR.reversed) + (rows, newCommits) + } + + def getCommits(timeline: HoodieDefaultTimeline, + limit: Int): Seq[Row] = { + val (rows: util.ArrayList[Row], newCommits: util.ArrayList[HoodieInstant]) = getSortCommits(timeline) + + for (i <- 0 until newCommits.size) { + val commit = newCommits.get(i) + val commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get, classOf[HoodieCommitMetadata]) + rows.add(Row(commit.getTimestamp, commitMetadata.fetchTotalBytesWritten, commitMetadata.fetchTotalFilesInsert, + commitMetadata.fetchTotalFilesUpdated, commitMetadata.fetchTotalPartitionsWritten, + commitMetadata.fetchTotalRecordsWritten, commitMetadata.fetchTotalUpdateRecordsWritten, + commitMetadata.fetchTotalWriteErrors)) + } + + rows.stream().limit(limit).toArray().map(r => r.asInstanceOf[Row]).toList + } +} + +object ShowCommitsProcedure { + val NAME = "show_commits" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new ShowCommitsProcedure(false) + } +} + +object ShowCommitsMetadataProcedure { + val NAME = "show_commits_metadata" + + def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] { + override def get() = new ShowCommitsProcedure(true) + } +} + + diff --git a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/parser/HoodieSqlCommonAstBuilder.scala b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/parser/HoodieSqlCommonAstBuilder.scala index b1f5a32fe1e19..3146740b1f3f5 100644 --- a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/parser/HoodieSqlCommonAstBuilder.scala +++ b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/parser/HoodieSqlCommonAstBuilder.scala @@ -17,22 +17,39 @@ package org.apache.spark.sql.parser +import org.antlr.v4.runtime.ParserRuleContext +import org.antlr.v4.runtime.tree.{ParseTree, RuleNode, TerminalNode} import org.apache.hudi.SparkAdapterSupport -import org.apache.hudi.spark.sql.parser.{HoodieSqlCommonBaseVisitor, HoodieSqlCommonParser} -import org.apache.hudi.spark.sql.parser.HoodieSqlCommonParser.{CompactionOnPathContext, CompactionOnTableContext, ShowCompactionOnPathContext, ShowCompactionOnTableContext, SingleStatementContext, TableIdentifierContext} +import org.apache.hudi.spark.sql.parser.HoodieSqlCommonBaseVisitor +import org.apache.hudi.spark.sql.parser.HoodieSqlCommonParser._ import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation -import org.apache.spark.sql.catalyst.parser.ParserUtils.withOrigin -import org.apache.spark.sql.catalyst.parser.{ParserInterface, ParserUtils} -import org.apache.spark.sql.catalyst.plans.logical.{CompactionOperation, CompactionPath, CompactionShowOnPath, CompactionShowOnTable, CompactionTable, LogicalPlan} +import org.apache.spark.sql.catalyst.expressions.{Expression, Literal} +import org.apache.spark.sql.catalyst.parser.{ParseException, ParserInterface, ParserUtils} +import org.apache.spark.sql.catalyst.plans.logical._ + +import scala.collection.JavaConverters._ 
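For orientation before the parser changes below: `visitCall` lowers a SQL `CALL` statement into a `CallCommand(name, args)`, where each argument is captured as a `PositionalArgument` or a `NamedArgument`; `BaseProcedure.checkArgs` and `getArgValueOrDefault` then align those arguments against the declared `ProcedureParameter`s by position or by name. A minimal usage sketch, assuming the accompanying grammar accepts the usual `name => value` form for named arguments; the table name and instant time are illustrative, not from this patch:

```scala
import org.apache.spark.sql.SparkSession

// Assumes a session with the Hudi SQL extension enabled, which installs the
// parser that routes CALL statements through HoodieSqlCommonAstBuilder.
val spark = SparkSession.builder()
  .appName("hudi-call-procedure-sketch")
  .master("local[2]")
  .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
  .getOrCreate()

// Positional arguments become PositionalArgument(expr) and are matched by index:
// parameter 0 is the table name, parameter 1 the row limit.
spark.sql("CALL show_commits('hudi_trips', 5)").show()

// Named arguments become NamedArgument(name, expr) and are matched by parameter name.
spark.sql("CALL rollback_to_instant(table => 'hudi_trips', instant_time => '20220126183541123')").show()
```

The procedure names resolve through the `HoodieProcedures` registry added above, which currently maps `show_commits`, `show_commits_metadata`, and `rollback_to_instant` to their builders.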
class HoodieSqlCommonAstBuilder(session: SparkSession, delegate: ParserInterface) extends HoodieSqlCommonBaseVisitor[AnyRef] with Logging with SparkAdapterSupport { import ParserUtils._ + /** + * Override the default behavior for all visit methods. This will only return a non-null result + * when the context has only one child. This is done because there is no generic method to + * combine the results of the context children. In all other cases null is returned. + */ + override def visitChildren(node: RuleNode): AnyRef = { + if (node.getChildCount == 1) { + node.getChild(0).accept(this) + } else { + null + } + } + override def visitSingleStatement(ctx: SingleStatementContext): LogicalPlan = withOrigin(ctx) { ctx.statement().accept(this).asInstanceOf[LogicalPlan] } @@ -72,4 +89,62 @@ class HoodieSqlCommonAstBuilder(session: SparkSession, delegate: ParserInterface override def visitTableIdentifier(ctx: TableIdentifierContext): LogicalPlan = withOrigin(ctx) { UnresolvedRelation(TableIdentifier(ctx.table.getText, Option(ctx.db).map(_.getText))) } + + override def visitCall(ctx: CallContext): LogicalPlan = withOrigin(ctx) { + if (ctx.callArgument().isEmpty) { + throw new ParseException("Procedure arguments are empty", ctx) + } + + val name: Seq[String] = ctx.multipartIdentifier().parts.asScala.map(_.getText) + val args: Seq[CallArgument] = ctx.callArgument().asScala.map(typedVisit[CallArgument]) + CallCommand(name, args) + } + + /** + * Return a multi-part identifier as Seq[String]. + */ + override def visitMultipartIdentifier(ctx: MultipartIdentifierContext): Seq[String] = withOrigin(ctx) { + ctx.parts.asScala.map(_.getText) + } + + /** + * Create a positional argument in a stored procedure call. + */ + override def visitPositionalArgument(ctx: PositionalArgumentContext): CallArgument = withOrigin(ctx) { + val expr = typedVisit[Expression](ctx.expression) + PositionalArgument(expr) + } + + /** + * Create a named argument in a stored procedure call.
+ */ + override def visitNamedArgument(ctx: NamedArgumentContext): CallArgument = withOrigin(ctx) { + val name = ctx.identifier.getText + val expr = typedVisit[Expression](ctx.expression) + NamedArgument(name, expr) + } + + def visitConstant(ctx: ConstantContext): Literal = { + delegate.parseExpression(ctx.getText).asInstanceOf[Literal] + } + + override def visitExpression(ctx: ExpressionContext): Expression = { + // reconstruct the SQL string and parse it using the main Spark parser + // while we can avoid the logic to build Spark expressions, we still have to parse them + // we cannot call ctx.getText directly since it will not render spaces correctly + // that's why we need to recurse down the tree in reconstructSqlString + val sqlString = reconstructSqlString(ctx) + delegate.parseExpression(sqlString) + } + + private def reconstructSqlString(ctx: ParserRuleContext): String = { + ctx.children.asScala.map { + case c: ParserRuleContext => reconstructSqlString(c) + case t: TerminalNode => t.getText + }.mkString(" ") + } + + private def typedVisit[T](ctx: ParseTree): T = { + ctx.accept(this).asInstanceOf[T] + } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaStreamingApp.java b/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaStreamingApp.java index f1e6b45b292b7..5baaffab0cf7c 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaStreamingApp.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/HoodieJavaStreamingApp.java @@ -164,7 +164,7 @@ public void run() throws Exception { ExecutorService executor = Executors.newFixedThreadPool(2); int numInitialCommits = 0; - // thread for spark strucutured streaming + // thread for spark structured streaming try { Future streamFuture = executor.submit(() -> { LOG.info("===== Streaming Starting ====="); @@ -211,7 +211,7 @@ public void run() throws Exception { Dataset inputDF3 = newSpark.read().json(jssc.parallelize(deletes, 2)); executor = Executors.newFixedThreadPool(2); - // thread for spark strucutured streaming + // thread for spark structured streaming try { Future streamFuture = executor.submit(() -> { LOG.info("===== Streaming Starting ====="); diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java index 0c5a2122d509a..bf3520f0956d7 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java @@ -191,7 +191,7 @@ public void testDoWriteOperationWithUserDefinedBulkInsertPartitioner() throws Ho @Test public void testCreateUserDefinedBulkInsertPartitionerRowsWithInValidPartitioner() throws HoodieException { - config = HoodieWriteConfig.newBuilder().withPath("/").withUserDefinedBulkInsertPartitionerClass("NonExistantUserDefinedClass").build(); + config = HoodieWriteConfig.newBuilder().withPath("/").withUserDefinedBulkInsertPartitionerClass("NonExistentUserDefinedClass").build(); Exception exception = assertThrows(HoodieException.class, () -> { DataSourceUtils.createUserDefinedBulkInsertPartitionerWithRows(config); diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestBootstrap.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestBootstrap.java index 2e89baa70b8bf..d2257f58d0e80 100644 --- 
a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestBootstrap.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestBootstrap.java @@ -32,6 +32,7 @@ import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; @@ -103,7 +104,6 @@ import static java.util.stream.Collectors.mapping; import static java.util.stream.Collectors.toList; -import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.generateGenericRecord; import static org.apache.spark.sql.functions.callUDF; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -178,7 +178,7 @@ public Schema generateNewDataSetAndReturnSchema(long timestamp, int numRecords, } @Test - public void testMetadataBootstrapUnpartitionedCOW() throws Exception { + public void testMetadataBootstrapNonpartitionedCOW() throws Exception { testBootstrapCommon(false, false, EffectiveMode.METADATA_BOOTSTRAP_MODE); } @@ -228,7 +228,7 @@ private void testBootstrapCommon(boolean partitioned, boolean deltaCommit, Effec bootstrapInstants = Arrays.asList(bootstrapCommitInstantTs); break; default: - bootstrapModeSelectorClass = TestRandomBootstapModeSelector.class.getName(); + bootstrapModeSelectorClass = TestRandomBootstrapModeSelector.class.getName(); bootstrapCommitInstantTs = HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS; checkNumRawFiles = false; isBootstrapIndexCreated = true; @@ -252,7 +252,6 @@ private void testBootstrapCommon(boolean partitioned, boolean deltaCommit, Effec .withFullBootstrapInputProvider(TestFullBootstrapDataProvider.class.getName()) .withBootstrapParallelism(3) .withBootstrapModeSelector(bootstrapModeSelectorClass).build()) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()) .build(); SparkRDDWriteClient client = new SparkRDDWriteClient(context, config); client.bootstrap(Option.empty()); @@ -510,7 +509,7 @@ private static JavaRDD generateInputBatch(JavaSparkContext jsc, try { String key = gr.get("_row_key").toString(); String pPath = p.getKey(); - return new HoodieRecord<>(new HoodieKey(key, pPath), new RawTripTestPayload(gr.toString(), key, pPath, + return new HoodieAvroRecord<>(new HoodieKey(key, pPath), new RawTripTestPayload(gr.toString(), key, pPath, HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)); } catch (IOException e) { throw new HoodieIOException(e.getMessage(), e); @@ -522,11 +521,11 @@ private static JavaRDD generateInputBatch(JavaSparkContext jsc, }).collect(Collectors.toList())); } - public static class TestRandomBootstapModeSelector extends BootstrapModeSelector { + public static class TestRandomBootstrapModeSelector extends BootstrapModeSelector { private int currIdx = new Random().nextInt(2); - public TestRandomBootstapModeSelector(HoodieWriteConfig writeConfig) { + public TestRandomBootstrapModeSelector(HoodieWriteConfig writeConfig) { super(writeConfig); } @@ -564,8 +563,7 @@ public static Dataset generateTestRawTripDataset(long timestamp, int from, final List records = new ArrayList<>(); IntStream.range(from, to).forEach(i -> { String id = "" + i; - records.add(generateGenericRecord("trip_" + id, Long.toString(timestamp), "rider_" + id, "driver_" + id, - timestamp, false, 
false).toString()); + records.add(new HoodieTestDataGenerator().generateGenericRecord("trip_" + id, Long.toString(timestamp), "rider_" + id, "driver_" + id, timestamp, false, false).toString()); }); if (isPartitioned) { sqlContext.udf().register("partgen", diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestOrcBootstrap.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestOrcBootstrap.java index fba09091add50..9146cdc4e81f7 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestOrcBootstrap.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestOrcBootstrap.java @@ -29,8 +29,8 @@ import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.common.bootstrap.FileStatusUtils; import org.apache.hudi.common.bootstrap.index.BootstrapIndex; -import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; @@ -98,7 +98,6 @@ import static java.util.stream.Collectors.mapping; import static java.util.stream.Collectors.toList; -import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.generateGenericRecord; import static org.apache.spark.sql.functions.callUDF; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -171,7 +170,7 @@ public Schema generateNewDataSetAndReturnSchema(long timestamp, int numRecords, } @Test - public void testMetadataBootstrapUnpartitionedCOW() throws Exception { + public void testMetadataBootstrapNonpartitionedCOW() throws Exception { testBootstrapCommon(false, false, EffectiveMode.METADATA_BOOTSTRAP_MODE); } @@ -221,7 +220,7 @@ private void testBootstrapCommon(boolean partitioned, boolean deltaCommit, Effec bootstrapInstants = Arrays.asList(bootstrapCommitInstantTs); break; default: - bootstrapModeSelectorClass = TestRandomBootstapModeSelector.class.getName(); + bootstrapModeSelectorClass = TestRandomBootstrapModeSelector.class.getName(); bootstrapCommitInstantTs = HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS; checkNumRawFiles = false; isBootstrapIndexCreated = true; @@ -245,7 +244,6 @@ private void testBootstrapCommon(boolean partitioned, boolean deltaCommit, Effec .withFullBootstrapInputProvider(TestFullBootstrapDataProvider.class.getName()) .withBootstrapParallelism(3) .withBootstrapModeSelector(bootstrapModeSelectorClass).build()) - .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(false).build()) .build(); SparkRDDWriteClient client = new SparkRDDWriteClient(context, config); client.bootstrap(Option.empty()); @@ -425,7 +423,7 @@ private static JavaRDD generateInputBatch(JavaSparkContext jsc, try { String key = gr.get("_row_key").toString(); String pPath = p.getKey(); - return new HoodieRecord<>(new HoodieKey(key, pPath), new RawTripTestPayload(gr.toString(), key, pPath, + return new HoodieAvroRecord<>(new HoodieKey(key, pPath), new RawTripTestPayload(gr.toString(), key, pPath, HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)); } catch (IOException e) { throw new HoodieIOException(e.getMessage(), e); @@ -437,10 +435,10 @@ private static JavaRDD generateInputBatch(JavaSparkContext jsc, }).collect(Collectors.toList())); } - public static class TestRandomBootstapModeSelector 
extends BootstrapModeSelector { + public static class TestRandomBootstrapModeSelector extends BootstrapModeSelector { private int currIdx = new Random().nextInt(2); - public TestRandomBootstapModeSelector(HoodieWriteConfig writeConfig) { + public TestRandomBootstrapModeSelector(HoodieWriteConfig writeConfig) { super(writeConfig); } @@ -477,8 +475,7 @@ public static Dataset generateTestRawTripDataset(long timestamp, int from, final List records = new ArrayList<>(); IntStream.range(from, to).forEach(i -> { String id = "" + i; - records.add(generateGenericRecord("trip_" + id, Long.toString(timestamp), "rider_" + id, "driver_" + id, - timestamp, false, false).toString()); + records.add(new HoodieTestDataGenerator().generateGenericRecord("trip_" + id, Long.toString(timestamp), "rider_" + id, "driver_" + id, timestamp, false, false).toString()); }); if (isPartitioned) { sqlContext.udf().register("partgen", diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestComplexKeyGenerator.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestComplexKeyGenerator.java similarity index 99% rename from hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestComplexKeyGenerator.java rename to hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestComplexKeyGenerator.java index 87deef2a58c4e..735277d959ee4 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestComplexKeyGenerator.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestComplexKeyGenerator.java @@ -18,13 +18,12 @@ package org.apache.hudi.keygen; +import org.apache.avro.generic.GenericRecord; import org.apache.hudi.AvroConversionUtils; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.exception.HoodieKeyException; - -import org.apache.avro.generic.GenericRecord; import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.apache.hudi.testutils.KeyGeneratorTestUtilities; import org.apache.spark.sql.Row; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestCustomKeyGenerator.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestCustomKeyGenerator.java similarity index 99% rename from hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestCustomKeyGenerator.java rename to hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestCustomKeyGenerator.java index 4b590d9374c8e..26a2b439abfb2 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestCustomKeyGenerator.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestCustomKeyGenerator.java @@ -18,10 +18,9 @@ package org.apache.hudi.keygen; -import org.apache.hudi.common.model.HoodieKey; -import org.apache.hudi.common.config.TypedProperties; - import org.apache.avro.generic.GenericRecord; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.apache.hudi.keygen.constant.KeyGeneratorType; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestGlobalDeleteKeyGenerator.java 
b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestGlobalDeleteKeyGenerator.java similarity index 99% rename from hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestGlobalDeleteKeyGenerator.java rename to hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestGlobalDeleteKeyGenerator.java index aa9568b7a4663..a0d90e028af82 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestGlobalDeleteKeyGenerator.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestGlobalDeleteKeyGenerator.java @@ -18,11 +18,10 @@ package org.apache.hudi.keygen; +import org.apache.avro.generic.GenericRecord; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.exception.HoodieKeyException; - -import org.apache.avro.generic.GenericRecord; import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.apache.hudi.testutils.KeyGeneratorTestUtilities; import org.apache.spark.sql.Row; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestNonpartitionedKeyGenerator.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestNonpartitionedKeyGenerator.java similarity index 99% rename from hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestNonpartitionedKeyGenerator.java rename to hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestNonpartitionedKeyGenerator.java index 0760de112b934..297b077794d56 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestNonpartitionedKeyGenerator.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestNonpartitionedKeyGenerator.java @@ -18,12 +18,11 @@ package org.apache.hudi.keygen; +import org.apache.avro.generic.GenericRecord; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.exception.HoodieKeyException; - -import org.apache.avro.generic.GenericRecord; import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.apache.hudi.testutils.KeyGeneratorTestUtilities; import org.apache.spark.sql.Row; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestSimpleKeyGenerator.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestSimpleKeyGenerator.java similarity index 99% rename from hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestSimpleKeyGenerator.java rename to hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestSimpleKeyGenerator.java index 0fc90c83a08d4..7dea9e414e693 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestSimpleKeyGenerator.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestSimpleKeyGenerator.java @@ -18,13 +18,12 @@ package org.apache.hudi.keygen; +import org.apache.avro.generic.GenericRecord; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.exception.HoodieKeyException; import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.apache.hudi.testutils.KeyGeneratorTestUtilities; - -import org.apache.avro.generic.GenericRecord; import org.apache.spark.sql.Row; import org.apache.spark.sql.catalyst.InternalRow; import 
org.junit.jupiter.api.Assertions; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestTimestampBasedKeyGenerator.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestTimestampBasedKeyGenerator.java similarity index 75% rename from hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestTimestampBasedKeyGenerator.java rename to hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestTimestampBasedKeyGenerator.java index 6f3c1a39f81ff..1fc4b9f1ef694 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/TestTimestampBasedKeyGenerator.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/TestTimestampBasedKeyGenerator.java @@ -18,18 +18,18 @@ package org.apache.hudi.keygen; -import org.apache.hudi.AvroConversionHelper; +import org.apache.avro.Conversions; +import org.apache.avro.LogicalTypes; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericFixed; +import org.apache.avro.generic.GenericRecord; import org.apache.hudi.AvroConversionUtils; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.testutils.SchemaTestUtil; - -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; import org.apache.hudi.exception.HoodieKeyGeneratorException; import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.apache.hudi.testutils.KeyGeneratorTestUtilities; - import org.apache.spark.sql.Row; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema; @@ -37,12 +37,12 @@ import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import scala.Function1; +import scala.Tuple2; import java.io.IOException; import java.math.BigDecimal; -import scala.Function1; - import static org.junit.jupiter.api.Assertions.assertEquals; public class TestTimestampBasedKeyGenerator { @@ -69,21 +69,9 @@ public void initialize() throws IOException { properties.setProperty(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_ENABLE.key(), "false"); } - private TypedProperties getBaseKeyConfig(String timestampType, String dateFormat, String timezone, String scalarType) { - properties.setProperty(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_TYPE_FIELD_PROP, timestampType); - properties.setProperty(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, dateFormat); - properties.setProperty(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_TIMEZONE_FORMAT_PROP, timezone); - - if (scalarType != null) { - properties.setProperty("hoodie.deltastreamer.keygen.timebased.timestamp.scalar.time.unit", scalarType); - } - - return properties; - } - private Row genericRecordToRow(GenericRecord baseRecord) { - Function1 convertor = AvroConversionHelper.createConverterToRow(baseRecord.getSchema(), structType); - Row row = (Row) convertor.apply(baseRecord); + Function1 convertor = AvroConversionUtils.createConverterToRow(baseRecord.getSchema(), structType); + Row row = convertor.apply(baseRecord); int fieldCount = structType.fieldNames().length; Object[] values = new Object[fieldCount]; for (int i = 0; i < fieldCount; i++) { @@ -92,24 +80,49 @@ private Row genericRecordToRow(GenericRecord baseRecord) { return new GenericRowWithSchema(values, structType); } - private TypedProperties getBaseKeyConfig(String timestampType, String inputFormatList, String 
inputFormatDelimiterRegex, String inputTimezone, String outputFormat, String outputTimezone) { + private TypedProperties getBaseKeyConfig(String partitionPathField, String timestampType, String dateFormat, String timezone, String scalarType) { + TypedProperties properties = new TypedProperties(this.properties); + + properties.setProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), partitionPathField); + properties.setProperty(KeyGeneratorOptions.Config.TIMESTAMP_TYPE_FIELD_PROP, timestampType); + properties.setProperty(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, dateFormat); + properties.setProperty(KeyGeneratorOptions.Config.TIMESTAMP_TIMEZONE_FORMAT_PROP, timezone); + + if (scalarType != null) { + properties.setProperty("hoodie.deltastreamer.keygen.timebased.timestamp.scalar.time.unit", scalarType); + } + + return properties; + } + + private TypedProperties getBaseKeyConfig(String partitionPathField, + String timestampType, + String inputFormatList, + String inputFormatDelimiterRegex, + String inputTimezone, + String outputFormat, + String outputTimezone) { + TypedProperties properties = new TypedProperties(this.properties); + + properties.setProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), partitionPathField); + if (timestampType != null) { - properties.setProperty(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_TYPE_FIELD_PROP, timestampType); + properties.setProperty(KeyGeneratorOptions.Config.TIMESTAMP_TYPE_FIELD_PROP, timestampType); } if (inputFormatList != null) { - properties.setProperty(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP, inputFormatList); + properties.setProperty(KeyGeneratorOptions.Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP, inputFormatList); } if (inputFormatDelimiterRegex != null) { - properties.setProperty(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_INPUT_DATE_FORMAT_LIST_DELIMITER_REGEX_PROP, inputFormatDelimiterRegex); + properties.setProperty(KeyGeneratorOptions.Config.TIMESTAMP_INPUT_DATE_FORMAT_LIST_DELIMITER_REGEX_PROP, inputFormatDelimiterRegex); } if (inputTimezone != null) { - properties.setProperty(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_INPUT_TIMEZONE_FORMAT_PROP, inputTimezone); + properties.setProperty(KeyGeneratorOptions.Config.TIMESTAMP_INPUT_TIMEZONE_FORMAT_PROP, inputTimezone); } if (outputFormat != null) { - properties.setProperty(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, outputFormat); + properties.setProperty(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, outputFormat); } if (outputTimezone != null) { - properties.setProperty(TimestampBasedAvroKeyGenerator.Config.TIMESTAMP_OUTPUT_TIMEZONE_FORMAT_PROP, outputTimezone); + properties.setProperty(KeyGeneratorOptions.Config.TIMESTAMP_OUTPUT_TIMEZONE_FORMAT_PROP, outputTimezone); } return properties; } @@ -118,7 +131,7 @@ private TypedProperties getBaseKeyConfig(String timestampType, String inputForma public void testTimestampBasedKeyGenerator() throws IOException { // timezone is GMT+8:00 baseRecord.put("createTime", 1578283932000L); - properties = getBaseKeyConfig("EPOCHMILLISECONDS", "yyyy-MM-dd hh", "GMT+8:00", null); + properties = getBaseKeyConfig("createTime", "EPOCHMILLISECONDS", "yyyy-MM-dd hh", "GMT+8:00", null); TimestampBasedKeyGenerator keyGen = new TimestampBasedKeyGenerator(properties); HoodieKey hk1 = keyGen.getKey(baseRecord); assertEquals("2020-01-06 12", hk1.getPartitionPath()); @@ -128,68 +141,60 @@ public void testTimestampBasedKeyGenerator() throws IOException { 
assertEquals("2020-01-06 12", keyGen.getPartitionPath(internalRow, baseRow.schema())); // timezone is GMT+8:00, createTime is BigDecimal - baseRecord.put("createTime", new BigDecimal(1578283932000.00001)); - properties = getBaseKeyConfig("EPOCHMILLISECONDS", "yyyy-MM-dd hh", "GMT+8:00", null); + BigDecimal decimal = new BigDecimal("1578283932000.0001"); + Conversions.DecimalConversion conversion = new Conversions.DecimalConversion(); + Tuple2 resolvedNullableSchema = AvroConversionUtils.resolveAvroTypeNullability(schema.getField("createTimeDecimal").schema()); + GenericFixed avroDecimal = conversion.toFixed(decimal, resolvedNullableSchema._2, LogicalTypes.decimal(20, 4)); + baseRecord.put("createTimeDecimal", avroDecimal); + properties = getBaseKeyConfig("createTimeDecimal", "EPOCHMILLISECONDS", "yyyy-MM-dd hh", "GMT+8:00", null); keyGen = new TimestampBasedKeyGenerator(properties); HoodieKey bigDecimalKey = keyGen.getKey(baseRecord); assertEquals("2020-01-06 12", bigDecimalKey.getPartitionPath()); - - // test w/ Row baseRow = genericRecordToRow(baseRecord); assertEquals("2020-01-06 12", keyGen.getPartitionPath(baseRow)); // timezone is GMT - properties = getBaseKeyConfig("EPOCHMILLISECONDS", "yyyy-MM-dd hh", "GMT", null); + properties = getBaseKeyConfig("createTime", "EPOCHMILLISECONDS", "yyyy-MM-dd hh", "GMT", null); keyGen = new TimestampBasedKeyGenerator(properties); HoodieKey hk2 = keyGen.getKey(baseRecord); assertEquals("2020-01-06 04", hk2.getPartitionPath()); - - // test w/ Row assertEquals("2020-01-06 04", keyGen.getPartitionPath(baseRow)); // timestamp is DATE_STRING, timezone is GMT+8:00 - baseRecord.put("createTime", "2020-01-06 12:12:12"); - properties = getBaseKeyConfig("DATE_STRING", "yyyy-MM-dd hh", "GMT+8:00", null); + baseRecord.put("createTimeString", "2020-01-06 12:12:12"); + properties = getBaseKeyConfig("createTimeString", "DATE_STRING", "yyyy-MM-dd hh", "GMT+8:00", null); properties.setProperty("hoodie.deltastreamer.keygen.timebased.input.dateformat", "yyyy-MM-dd hh:mm:ss"); keyGen = new TimestampBasedKeyGenerator(properties); HoodieKey hk3 = keyGen.getKey(baseRecord); assertEquals("2020-01-06 12", hk3.getPartitionPath()); - - // test w/ Row baseRow = genericRecordToRow(baseRecord); assertEquals("2020-01-06 12", keyGen.getPartitionPath(baseRow)); // timezone is GMT - properties = getBaseKeyConfig("DATE_STRING", "yyyy-MM-dd hh", "GMT", null); + properties = getBaseKeyConfig("createTimeString", "DATE_STRING", "yyyy-MM-dd hh", "GMT", null); keyGen = new TimestampBasedKeyGenerator(properties); HoodieKey hk4 = keyGen.getKey(baseRecord); assertEquals("2020-01-06 12", hk4.getPartitionPath()); - - // test w/ Row assertEquals("2020-01-06 12", keyGen.getPartitionPath(baseRow)); // timezone is GMT+8:00, createTime is null baseRecord.put("createTime", null); - properties = getBaseKeyConfig("EPOCHMILLISECONDS", "yyyy-MM-dd hh", "GMT+8:00", null); + properties = getBaseKeyConfig("createTime", "EPOCHMILLISECONDS", "yyyy-MM-dd hh", "GMT+8:00", null); keyGen = new TimestampBasedKeyGenerator(properties); HoodieKey hk5 = keyGen.getKey(baseRecord); assertEquals("1970-01-01 08", hk5.getPartitionPath()); - - // test w/ Row baseRow = genericRecordToRow(baseRecord); assertEquals("1970-01-01 08", keyGen.getPartitionPath(baseRow)); internalRow = KeyGeneratorTestUtilities.getInternalRow(baseRow); assertEquals("1970-01-01 08", keyGen.getPartitionPath(internalRow, baseRow.schema())); // timestamp is DATE_STRING, timezone is GMT, createTime is null - baseRecord.put("createTime", null); - 
properties = getBaseKeyConfig("DATE_STRING", "yyyy-MM-dd hh:mm:ss", "GMT", null); + baseRecord.put("createTimeString", null); + properties = getBaseKeyConfig("createTime", "DATE_STRING", "yyyy-MM-dd hh:mm:ss", "GMT", null); properties.setProperty("hoodie.deltastreamer.keygen.timebased.input.dateformat", "yyyy-MM-dd hh:mm:ss"); keyGen = new TimestampBasedKeyGenerator(properties); HoodieKey hk6 = keyGen.getKey(baseRecord); assertEquals("1970-01-01 12:00:00", hk6.getPartitionPath()); - - // test w/ Row baseRow = genericRecordToRow(baseRecord); assertEquals("1970-01-01 12:00:00", keyGen.getPartitionPath(baseRow)); internalRow = KeyGeneratorTestUtilities.getInternalRow(baseRow); @@ -202,7 +207,7 @@ public void testScalar() throws IOException { baseRecord.put("createTime", 20000L); // timezone is GMT - properties = getBaseKeyConfig("SCALAR", "yyyy-MM-dd hh", "GMT", "days"); + properties = getBaseKeyConfig("createTime", "SCALAR", "yyyy-MM-dd hh", "GMT", "days"); TimestampBasedKeyGenerator keyGen = new TimestampBasedKeyGenerator(properties); HoodieKey hk1 = keyGen.getKey(baseRecord); assertEquals(hk1.getPartitionPath(), "2024-10-04 12"); @@ -215,7 +220,7 @@ public void testScalar() throws IOException { // timezone is GMT, createTime is null baseRecord.put("createTime", null); - properties = getBaseKeyConfig("SCALAR", "yyyy-MM-dd hh", "GMT", "days"); + properties = getBaseKeyConfig("createTime", "SCALAR", "yyyy-MM-dd hh", "GMT", "days"); keyGen = new TimestampBasedKeyGenerator(properties); HoodieKey hk2 = keyGen.getKey(baseRecord); assertEquals("1970-01-02 12", hk2.getPartitionPath()); @@ -227,8 +232,8 @@ public void testScalar() throws IOException { assertEquals("1970-01-02 12", keyGen.getPartitionPath(internalRow, baseRow.schema())); // timezone is GMT; the number of days is stored as an integer in MySQL - baseRecord.put("createTime", 18736); - properties = getBaseKeyConfig("SCALAR", "yyyy-MM-dd", "GMT", "DAYS"); + baseRecord.put("createTime", 18736L); + properties = getBaseKeyConfig("createTime", "SCALAR", "yyyy-MM-dd", "GMT", "DAYS"); keyGen = new TimestampBasedKeyGenerator(properties); HoodieKey scalarSecondsKey = keyGen.getKey(baseRecord); assertEquals("2021-04-19", scalarSecondsKey.getPartitionPath()); @@ -245,7 +250,7 @@ public void testScalarWithLogicalType() throws IOException { baseRecord = SchemaTestUtil.generateAvroRecordFromJson(schema, 1, "001", "f1"); baseRecord.put("createTime", 1638513806000000L); - properties = getBaseKeyConfig("SCALAR", "yyyy/MM/dd", "GMT", "MICROSECONDS"); + properties = getBaseKeyConfig("createTime", "SCALAR", "yyyy/MM/dd", "GMT", "MICROSECONDS"); properties.setProperty(KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(), "true"); TimestampBasedKeyGenerator keyGen = new TimestampBasedKeyGenerator(properties); HoodieKey hk1 = keyGen.getKey(baseRecord); @@ -259,7 +264,7 @@ public void testScalarWithLogicalType() throws IOException { // timezone is GMT, createTime is null baseRecord.put("createTime", null); - properties = getBaseKeyConfig("SCALAR", "yyyy/MM/dd", "GMT", "MICROSECONDS"); + properties = getBaseKeyConfig("createTime", "SCALAR", "yyyy/MM/dd", "GMT", "MICROSECONDS"); properties.setProperty(KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(), "true"); keyGen = new TimestampBasedKeyGenerator(properties); HoodieKey hk2 = keyGen.getKey(baseRecord); @@ -274,8 +279,9 @@ public void testScalarWithLogicalType() throws IOException { @Test public void test_ExpectsMatch_SingleInputFormat_ISO8601WithMsZ_OutputTimezoneAsUTC()
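// Editorial note (inferred from the config and test names below, not stated in the patch): for
// DATE_STRING inputs, TIMESTAMP_INPUT_DATE_FORMAT_PROP may carry several patterns separated by the
// TIMESTAMP_INPUT_DATE_FORMAT_LIST_DELIMITER_REGEX_PROP delimiter, e.g.
//   "yyyy-MM-dd'T'HH:mm:ssZ,yyyy-MM-dd'T'HH:mm:ss.SSSZ"
// and the generator appears to try each pattern in turn, throwing only when none of them matches
// (see test_Throws_MultipleInputFormats_InputDateNotMatchingFormats further down).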
throws IOException { - baseRecord.put("createTime", "2020-04-01T13:01:33.428Z"); + baseRecord.put("createTimeString", "2020-04-01T13:01:33.428Z"); properties = this.getBaseKeyConfig( + "createTimeString", "DATE_STRING", "yyyy-MM-dd'T'HH:mm:ss.SSSZ", "", @@ -292,8 +298,9 @@ public void test_ExpectsMatch_SingleInputFormat_ISO8601WithMsZ_OutputTimezoneAsU @Test public void test_ExpectsMatch_SingleInputFormats_ISO8601WithMsZ_OutputTimezoneAsInputDateTimeZone() throws IOException { - baseRecord.put("createTime", "2020-04-01T13:01:33.428Z"); + baseRecord.put("createTimeString", "2020-04-01T13:01:33.428Z"); properties = this.getBaseKeyConfig( + "createTimeString", "DATE_STRING", "yyyy-MM-dd'T'HH:mm:ss.SSSZ", "", @@ -310,8 +317,9 @@ public void test_ExpectsMatch_SingleInputFormats_ISO8601WithMsZ_OutputTimezoneAs @Test public void test_ExpectsMatch_MultipleInputFormats_ISO8601WithMsZ_OutputTimezoneAsUTC() throws IOException { - baseRecord.put("createTime", "2020-04-01T13:01:33.428Z"); + baseRecord.put("createTimeString", "2020-04-01T13:01:33.428Z"); properties = this.getBaseKeyConfig( + "createTimeString", "DATE_STRING", "yyyy-MM-dd'T'HH:mm:ssZ,yyyy-MM-dd'T'HH:mm:ss.SSSZ", "", @@ -328,8 +336,9 @@ public void test_ExpectsMatch_MultipleInputFormats_ISO8601WithMsZ_OutputTimezone @Test public void test_ExpectsMatch_MultipleInputFormats_ISO8601NoMsZ_OutputTimezoneAsUTC() throws IOException { - baseRecord.put("createTime", "2020-04-01T13:01:33Z"); + baseRecord.put("createTimeString", "2020-04-01T13:01:33Z"); properties = this.getBaseKeyConfig( + "createTimeString", "DATE_STRING", "yyyy-MM-dd'T'HH:mm:ssZ,yyyy-MM-dd'T'HH:mm:ss.SSSZ", "", @@ -346,8 +355,9 @@ public void test_ExpectsMatch_MultipleInputFormats_ISO8601NoMsZ_OutputTimezoneAs @Test public void test_ExpectsMatch_MultipleInputFormats_ISO8601NoMsWithOffset_OutputTimezoneAsUTC() throws IOException { - baseRecord.put("createTime", "2020-04-01T13:01:33-05:00"); + baseRecord.put("createTimeString", "2020-04-01T13:01:33-05:00"); properties = this.getBaseKeyConfig( + "createTimeString", "DATE_STRING", "yyyy-MM-dd'T'HH:mm:ssZ,yyyy-MM-dd'T'HH:mm:ss.SSSZ", "", @@ -364,8 +374,9 @@ public void test_ExpectsMatch_MultipleInputFormats_ISO8601NoMsWithOffset_OutputT @Test public void test_ExpectsMatch_MultipleInputFormats_ISO8601WithMsWithOffset_OutputTimezoneAsUTC() throws IOException { - baseRecord.put("createTime", "2020-04-01T13:01:33.123-05:00"); + baseRecord.put("createTimeString", "2020-04-01T13:01:33.123-05:00"); properties = this.getBaseKeyConfig( + "createTimeString", "DATE_STRING", "yyyy-MM-dd'T'HH:mm:ssZ,yyyy-MM-dd'T'HH:mm:ss.SSSZ", "", @@ -382,8 +393,9 @@ public void test_ExpectsMatch_MultipleInputFormats_ISO8601WithMsWithOffset_Outpu @Test public void test_ExpectsMatch_MultipleInputFormats_ISO8601WithMsZ_OutputTimezoneAsEST() throws IOException { - baseRecord.put("createTime", "2020-04-01T13:01:33.123Z"); + baseRecord.put("createTimeString", "2020-04-01T13:01:33.123Z"); properties = this.getBaseKeyConfig( + "createTimeString", "DATE_STRING", "yyyy-MM-dd'T'HH:mm:ssZ,yyyy-MM-dd'T'HH:mm:ss.SSSZ", "", @@ -400,8 +412,9 @@ public void test_ExpectsMatch_MultipleInputFormats_ISO8601WithMsZ_OutputTimezone @Test public void test_Throws_MultipleInputFormats_InputDateNotMatchingFormats() throws IOException { - baseRecord.put("createTime", "2020-04-01 13:01:33.123-05:00"); + baseRecord.put("createTimeString", "2020-04-01 13:01:33.123-05:00"); properties = this.getBaseKeyConfig( + "createTimeString", "DATE_STRING", 
"yyyy-MM-dd'T'HH:mm:ssZ,yyyy-MM-dd'T'HH:mm:ss.SSSZ", "", @@ -417,8 +430,9 @@ public void test_Throws_MultipleInputFormats_InputDateNotMatchingFormats() throw @Test public void test_ExpectsMatch_MultipleInputFormats_ShortDate_OutputCustomDate() throws IOException { - baseRecord.put("createTime", "20200401"); + baseRecord.put("createTimeString", "20200401"); properties = this.getBaseKeyConfig( + "createTimeString", "DATE_STRING", "yyyy-MM-dd'T'HH:mm:ssZ,yyyy-MM-dd'T'HH:mm:ss.SSSZ,yyyyMMdd", "", diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/factory/TestCreateKeyGeneratorByTypeWithFactory.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/factory/TestCreateKeyGeneratorByTypeWithFactory.java similarity index 100% rename from hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/factory/TestCreateKeyGeneratorByTypeWithFactory.java rename to hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/factory/TestCreateKeyGeneratorByTypeWithFactory.java diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/factory/TestHoodieSparkKeyGeneratorFactory.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/factory/TestHoodieSparkKeyGeneratorFactory.java similarity index 98% rename from hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/factory/TestHoodieSparkKeyGeneratorFactory.java rename to hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/factory/TestHoodieSparkKeyGeneratorFactory.java index dffe1eaa96c24..816c1fb86d4b9 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/keygen/factory/TestHoodieSparkKeyGeneratorFactory.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/keygen/factory/TestHoodieSparkKeyGeneratorFactory.java @@ -25,8 +25,8 @@ import org.apache.hudi.keygen.SimpleKeyGenerator; import org.apache.hudi.keygen.TestComplexKeyGenerator; import org.apache.hudi.keygen.constant.KeyGeneratorOptions; - import org.apache.hudi.keygen.constant.KeyGeneratorType; + import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; @@ -58,7 +58,7 @@ public void testKeyGeneratorFactory() throws IOException { // set both class name and keyGenerator type props.put(HoodieWriteConfig.KEYGENERATOR_TYPE.key(), KeyGeneratorType.CUSTOM.name()); KeyGenerator keyGenerator3 = HoodieSparkKeyGeneratorFactory.createKeyGenerator(props); - // KEYGENERATOR_TYPE_PROP was overitten by KEYGENERATOR_CLASS_PROP + // KEYGENERATOR_TYPE_PROP was overwritten by KEYGENERATOR_CLASS_PROP Assertions.assertEquals(SimpleKeyGenerator.class.getName(), keyGenerator3.getClass().getName()); // set wrong class name diff --git a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/payload/TestAWSDmsAvroPayload.java b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/payload/TestAWSDmsAvroPayload.java index 802096a3a74e1..cf3d9a94d1be2 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/payload/TestAWSDmsAvroPayload.java +++ b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/payload/TestAWSDmsAvroPayload.java @@ -25,7 +25,6 @@ import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; - import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -98,7 +97,7 @@ public void testDelete() { try { Option outputPayload = 
payload.combineAndGetUpdateValue(oldRecord, avroSchema); - // expect nothing to be comitted to table + // expect nothing to be committed to table assertFalse(outputPayload.isPresent()); } catch (Exception e) { fail("Unexpected exception"); @@ -123,7 +122,7 @@ public void testPreCombineWithDelete() { try { OverwriteWithLatestAvroPayload output = payload.preCombine(insertPayload); Option outputPayload = output.getInsertValue(avroSchema); - // expect nothing to be comitted to table + // expect nothing to be committed to table assertFalse(outputPayload.isPresent()); } catch (Exception e) { fail("Unexpected exception"); diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/another-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/another-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json new file mode 100644 index 0000000000000..59b3ff043a8ec --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/another-input-table-json/part-00000-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json @@ -0,0 +1,10 @@ +{"c1":770,"c2":" 770sdc","c3":335.770,"c4":"2021-11-18T23:34:44.201-08:00","c5":78,"c6":"2020-01-15","c7":"Ag==","c8":9} +{"c1":768,"c2":" 768sdc","c3":64.768,"c4":"2021-11-18T23:34:44.201-08:00","c5":78,"c6":"2020-10-13","c7":"AA==","c8":9} +{"c1":431,"c2":" 431sdc","c3":153.431,"c4":"2021-11-18T23:34:44.186-08:00","c5":44,"c6":"2020-03-12","c7":"rw==","c8":9} +{"c1":427,"c2":" 427sdc","c3":246.427,"c4":"2021-11-18T23:34:44.186-08:00","c5":44,"c6":"2020-10-08","c7":"qw==","c8":9} +{"c1":328,"c2":" 328sdc","c3":977.328,"c4":"2021-11-18T23:34:44.181-08:00","c5":34,"c6":"2020-10-21","c7":"SA==","c8":9} +{"c1":320,"c2":" 320sdc","c3":230.320,"c4":"2021-11-18T23:34:44.180-08:00","c5":33,"c6":"2020-02-13","c7":"QA==","c8":9} +{"c1":317,"c2":" 317sdc","c3":580.317,"c4":"2021-11-18T23:34:44.180-08:00","c5":33,"c6":"2020-10-10","c7":"PQ==","c8":9} +{"c1":308,"c2":" 308sdc","c3":375.308,"c4":"2021-11-18T23:34:44.180-08:00","c5":32,"c6":"2020-01-01","c7":"NA==","c8":9} +{"c1":304,"c2":" 304sdc","c3":904.304,"c4":"2021-11-18T23:34:44.179-08:00","c5":32,"c6":"2020-08-25","c7":"MA==","c8":9} +{"c1":300,"c2":" 300sdc","c3":398.300,"c4":"2021-11-18T23:34:44.179-08:00","c5":31,"c6":"2020-04-21","c7":"LA==","c8":9} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/another-input-table-json/part-00001-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/another-input-table-json/part-00001-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json new file mode 100644 index 0000000000000..c5a11067c9782 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/another-input-table-json/part-00001-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json @@ -0,0 +1,10 @@ +{"c1":719,"c2":" 719sdc","c3":707.719,"c4":"2021-11-18T23:34:44.199-08:00","c5":73,"c6":"2020-05-20","c7":"zw==","c8":9} +{"c1":715,"c2":" 715sdc","c3":777.715,"c4":"2021-11-18T23:34:44.199-08:00","c5":73,"c6":"2020-01-16","c7":"yw==","c8":9} +{"c1":579,"c2":" 579sdc","c3":958.579,"c4":"2021-11-18T23:34:44.193-08:00","c5":59,"c6":"2020-08-20","c7":"Qw==","c8":9} +{"c1":568,"c2":" 568sdc","c3":667.568,"c4":"2021-11-18T23:34:44.193-08:00","c5":58,"c6":"2020-08-09","c7":"OA==","c8":9} +{"c1":367,"c2":" 
367sdc","c3":791.367,"c4":"2021-11-18T23:34:44.183-08:00","c5":38,"c6":"2020-05-04","c7":"bw==","c8":9} +{"c1":364,"c2":" 364sdc","c3":264.364,"c4":"2021-11-18T23:34:44.183-08:00","c5":38,"c6":"2020-02-01","c7":"bA==","c8":9} +{"c1":250,"c2":" 250sdc","c3":624.250,"c4":"2021-11-18T23:34:44.176-08:00","c5":26,"c6":"2020-09-27","c7":"+g==","c8":9} +{"c1":249,"c2":" 249sdc","c3":579.249,"c4":"2021-11-18T23:34:44.176-08:00","c5":26,"c6":"2020-08-26","c7":"+Q==","c8":9} +{"c1":246,"c2":" 246sdc","c3":413.246,"c4":"2021-11-18T23:34:44.176-08:00","c5":26,"c6":"2020-05-23","c7":"9g==","c8":9} +{"c1":125,"c2":" 125sdc","c3":153.125,"c4":"2021-11-18T23:34:44.169-08:00","c5":14,"c6":"2020-05-14","c7":"fQ==","c8":9} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/another-input-table-json/part-00002-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/another-input-table-json/part-00002-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json new file mode 100644 index 0000000000000..585eb31329e62 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/another-input-table-json/part-00002-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json @@ -0,0 +1,10 @@ +{"c1":486,"c2":" 486sdc","c3":278.486,"c4":"2021-11-18T23:34:44.189-08:00","c5":50,"c6":"2020-03-11","c7":"5g==","c8":9} +{"c1":483,"c2":" 483sdc","c3":162.483,"c4":"2021-11-18T23:34:44.189-08:00","c5":49,"c6":"2020-11-08","c7":"4w==","c8":9} +{"c1":224,"c2":" 224sdc","c3":294.224,"c4":"2021-11-18T23:34:44.175-08:00","c5":24,"c6":"2020-05-01","c7":"4A==","c8":9} +{"c1":118,"c2":" 118sdc","c3":204.118,"c4":"2021-11-18T23:34:44.168-08:00","c5":13,"c6":"2020-09-07","c7":"dg==","c8":9} +{"c1":111,"c2":" 111sdc","c3":82.111,"c4":"2021-11-18T23:34:44.168-08:00","c5":12,"c6":"2020-02-28","c7":"bw==","c8":9} +{"c1":79,"c2":" 79sdc","c3":198.790,"c4":"2021-11-18T23:34:44.166-08:00","c5":9,"c6":"2020-03-24","c7":"Tw==","c8":9} +{"c1":77,"c2":" 77sdc","c3":619.770,"c4":"2021-11-18T23:34:44.166-08:00","c5":9,"c6":"2020-01-22","c7":"TQ==","c8":9} +{"c1":76,"c2":" 76sdc","c3":315.760,"c4":"2021-11-18T23:34:44.166-08:00","c5":9,"c6":"2020-11-21","c7":"TA==","c8":9} +{"c1":60,"c2":" 60sdc","c3":326.600,"c4":"2021-11-18T23:34:44.164-08:00","c5":7,"c6":"2020-06-05","c7":"PA==","c8":9} +{"c1":59,"c2":" 59sdc","c3":771.590,"c4":"2021-11-18T23:34:44.164-08:00","c5":7,"c6":"2020-05-04","c7":"Ow==","c8":9} diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/another-input-table-json/part-00003-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/another-input-table-json/part-00003-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json new file mode 100644 index 0000000000000..2e37e6a180eba --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/another-input-table-json/part-00003-7e680484-e7e1-48b6-8289-1a7c483b530b-c000.json @@ -0,0 +1,10 @@ +{"c1":272,"c2":" 272sdc","c3":979.272,"c4":"2021-11-18T23:34:44.178-08:00","c5":28,"c6":"2020-09-21","c7":"EA==","c8":9} +{"c1":258,"c2":" 258sdc","c3":627.258,"c4":"2021-11-18T23:34:44.177-08:00","c5":27,"c6":"2020-06-07","c7":"Ag==","c8":9} +{"c1":240,"c2":" 240sdc","c3":880.240,"c4":"2021-11-18T23:34:44.176-08:00","c5":25,"c6":"2020-10-17","c7":"8A==","c8":9} +{"c1":236,"c2":" 236sdc","c3":576.236,"c4":"2021-11-18T23:34:44.176-08:00","c5":25,"c6":"2020-06-13","c7":"7A==","c8":9} 
+{"c1":137,"c2":" 137sdc","c3":597.137,"c4":"2021-11-18T23:34:44.170-08:00","c5":15,"c6":"2020-06-26","c7":"iQ==","c8":9} +{"c1":134,"c2":" 134sdc","c3":802.134,"c4":"2021-11-18T23:34:44.170-08:00","c5":15,"c6":"2020-03-23","c7":"hg==","c8":9} +{"c1":131,"c2":" 131sdc","c3":959.131,"c4":"2021-11-18T23:34:44.169-08:00","c5":14,"c6":"2020-11-20","c7":"gw==","c8":9} +{"c1":129,"c2":" 129sdc","c3":430.129,"c4":"2021-11-18T23:34:44.169-08:00","c5":14,"c6":"2020-09-18","c7":"gQ==","c8":9} +{"c1":24,"c2":" 24sdc","c3":867.240,"c4":"2021-11-18T23:34:44.161-08:00","c5":4,"c6":"2020-03-25","c7":"GA==","c8":9} +{"c1":8,"c2":" 8sdc","c3":977.800,"c4":"2021-11-18T23:34:44.159-08:00","c5":2,"c6":"2020-09-09","c7":"CA==","c8":9} diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/input-table-json/part-00000-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/input-table-json/part-00000-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json new file mode 100644 index 0000000000000..43d89698c40ba --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/input-table-json/part-00000-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json @@ -0,0 +1,10 @@ +{"c1":323,"c2":" 323sdc","c3":738.323,"c4":"2021-11-19T20:40:55.522-08:00","c5":33,"c6":"2020-05-16","c7":"Qw==","c8":9} +{"c1":326,"c2":" 326sdc","c3":481.326,"c4":"2021-11-19T20:40:55.522-08:00","c5":34,"c6":"2020-08-19","c7":"Rg==","c8":9} +{"c1":555,"c2":" 555sdc","c3":791.555,"c4":"2021-11-19T20:40:55.535-08:00","c5":57,"c6":"2020-06-24","c7":"Kw==","c8":9} +{"c1":556,"c2":" 556sdc","c3":100.556,"c4":"2021-11-19T20:40:55.535-08:00","c5":57,"c6":"2020-07-25","c7":"LA==","c8":9} +{"c1":562,"c2":" 562sdc","c3":100.562,"c4":"2021-11-19T20:40:55.535-08:00","c5":57,"c6":"2020-02-03","c7":"Mg==","c8":9} +{"c1":619,"c2":" 619sdc","c3":284.619,"c4":"2021-11-19T20:40:55.537-08:00","c5":63,"c6":"2020-04-04","c7":"aw==","c8":9} +{"c1":624,"c2":" 624sdc","c3":783.624,"c4":"2021-11-19T20:40:55.537-08:00","c5":64,"c6":"2020-09-09","c7":"cA==","c8":9} +{"c1":633,"c2":" 633sdc","c3":706.633,"c4":"2021-11-19T20:40:55.538-08:00","c5":64,"c6":"2020-07-18","c7":"eQ==","c8":9} +{"c1":638,"c2":" 638sdc","c3":811.638,"c4":"2021-11-19T20:40:55.538-08:00","c5":65,"c6":"2020-01-23","c7":"fg==","c8":9} +{"c1":639,"c2":" 639sdc","c3":299.639,"c4":"2021-11-19T20:40:55.538-08:00","c5":65,"c6":"2020-02-24","c7":"fw==","c8":9} diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/input-table-json/part-00001-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/input-table-json/part-00001-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json new file mode 100644 index 0000000000000..7537986a1f7cc --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/input-table-json/part-00001-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json @@ -0,0 +1,10 @@ +{"c1":74,"c2":" 74sdc","c3":38.740,"c4":"2021-11-19T20:40:55.507-08:00","c5":9,"c6":"2020-09-19","c7":"Sg==","c8":9} +{"c1":181,"c2":" 181sdc","c3":754.181,"c4":"2021-11-19T20:40:55.514-08:00","c5":19,"c6":"2020-06-14","c7":"tQ==","c8":9} +{"c1":212,"c2":" 212sdc","c3":633.212,"c4":"2021-11-19T20:40:55.516-08:00","c5":22,"c6":"2020-04-17","c7":"1A==","c8":9} +{"c1":213,"c2":" 213sdc","c3":980.213,"c4":"2021-11-19T20:40:55.516-08:00","c5":22,"c6":"2020-05-18","c7":"1Q==","c8":9} +{"c1":428,"c2":" 
428sdc","c3":550.428,"c4":"2021-11-19T20:40:55.528-08:00","c5":44,"c6":"2020-11-09","c7":"rA==","c8":9} +{"c1":429,"c2":" 429sdc","c3":799.429,"c4":"2021-11-19T20:40:55.528-08:00","c5":44,"c6":"2020-01-10","c7":"rQ==","c8":9} +{"c1":430,"c2":" 430sdc","c3":76.430,"c4":"2021-11-19T20:40:55.528-08:00","c5":44,"c6":"2020-02-11","c7":"rg==","c8":9} +{"c1":539,"c2":" 539sdc","c3":866.539,"c4":"2021-11-19T20:40:55.534-08:00","c5":55,"c6":"2020-01-08","c7":"Gw==","c8":9} +{"c1":552,"c2":" 552sdc","c3":382.552,"c4":"2021-11-19T20:40:55.535-08:00","c5":56,"c6":"2020-03-21","c7":"KA==","c8":9} +{"c1":559,"c2":" 559sdc","c3":699.559,"c4":"2021-11-19T20:40:55.535-08:00","c5":57,"c6":"2020-10-28","c7":"Lw==","c8":9} diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/input-table-json/part-00002-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/input-table-json/part-00002-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json new file mode 100644 index 0000000000000..7f171d3b7f575 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/input-table-json/part-00002-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json @@ -0,0 +1,10 @@ +{"c1":355,"c2":" 355sdc","c3":994.355,"c4":"2021-11-19T20:40:55.524-08:00","c5":37,"c6":"2020-04-20","c7":"Yw==","c8":9} +{"c1":358,"c2":" 358sdc","c3":975.358,"c4":"2021-11-19T20:40:55.524-08:00","c5":37,"c6":"2020-07-23","c7":"Zg==","c8":9} +{"c1":769,"c2":" 769sdc","c3":919.769,"c4":"2021-11-19T20:40:55.543-08:00","c5":78,"c6":"2020-11-14","c7":"AQ==","c8":9} +{"c1":882,"c2":" 882sdc","c3":374.882,"c4":"2021-11-19T20:40:55.547-08:00","c5":89,"c6":"2020-03-15","c7":"cg==","c8":9} +{"c1":892,"c2":" 892sdc","c3":787.892,"c4":"2021-11-19T20:40:55.547-08:00","c5":90,"c6":"2020-02-25","c7":"fA==","c8":9} +{"c1":917,"c2":" 917sdc","c3":912.917,"c4":"2021-11-19T20:40:55.548-08:00","c5":93,"c6":"2020-05-22","c7":"lQ==","c8":9} +{"c1":932,"c2":" 932sdc","c3":990.932,"c4":"2021-11-19T20:40:55.549-08:00","c5":94,"c6":"2020-09-09","c7":"pA==","c8":9} +{"c1":933,"c2":" 933sdc","c3":510.933,"c4":"2021-11-19T20:40:55.549-08:00","c5":94,"c6":"2020-10-10","c7":"pQ==","c8":9} +{"c1":943,"c2":" 943sdc","c3":601.943,"c4":"2021-11-19T20:40:55.549-08:00","c5":95,"c6":"2020-09-20","c7":"rw==","c8":9} +{"c1":945,"c2":" 945sdc","c3":790.945,"c4":"2021-11-19T20:40:55.549-08:00","c5":96,"c6":"2020-11-22","c7":"sQ==","c8":9} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/input-table-json/part-00003-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/input-table-json/part-00003-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json new file mode 100644 index 0000000000000..48d91417b2c60 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/input-table-json/part-00003-4468afca-8a37-4ae8-a150-0c2fd3361080-c000.json @@ -0,0 +1,10 @@ +{"c1":0,"c2":" 0sdc","c3":19.000,"c4":"2021-11-19T20:40:55.339-08:00","c5":1,"c6":"2020-01-01","c7":"AA==","c8":9} +{"c1":89,"c2":" 89sdc","c3":759.890,"c4":"2021-11-19T20:40:55.508-08:00","c5":10,"c6":"2020-02-06","c7":"WQ==","c8":9} +{"c1":199,"c2":" 199sdc","c3":315.199,"c4":"2021-11-19T20:40:55.515-08:00","c5":21,"c6":"2020-02-04","c7":"xw==","c8":9} +{"c1":200,"c2":" 200sdc","c3":618.200,"c4":"2021-11-19T20:40:55.515-08:00","c5":21,"c6":"2020-03-05","c7":"yA==","c8":9} +{"c1":309,"c2":" 
309sdc","c3":642.309,"c4":"2021-11-19T20:40:55.521-08:00","c5":32,"c6":"2020-02-02","c7":"NQ==","c8":9} +{"c1":318,"c2":" 318sdc","c3":106.318,"c4":"2021-11-19T20:40:55.522-08:00","c5":33,"c6":"2020-11-11","c7":"Pg==","c8":9} +{"c1":329,"c2":" 329sdc","c3":200.329,"c4":"2021-11-19T20:40:55.522-08:00","c5":34,"c6":"2020-11-22","c7":"SQ==","c8":9} +{"c1":690,"c2":" 690sdc","c3":854.690,"c4":"2021-11-19T20:40:55.540-08:00","c5":70,"c6":"2020-09-19","c7":"sg==","c8":9} +{"c1":697,"c2":" 697sdc","c3":916.697,"c4":"2021-11-19T20:40:55.540-08:00","c5":71,"c6":"2020-05-26","c7":"uQ==","c8":9} +{"c1":959,"c2":" 959sdc","c3":480.959,"c4":"2021-11-19T20:40:55.550-08:00","c5":97,"c6":"2020-03-08","c7":"vw==","c8":9} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/z-index-table-merged.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/z-index-table-merged.json index 5c876126ae1d6..00d16c660c503 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/z-index-table-merged.json +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/z-index-table-merged.json @@ -1,8 +1,8 @@ -{"c1_maxValue":1000,"c1_minValue":3,"c1_num_nulls":0,"c2_maxValue":" 993sdc","c2_minValue":" 1000sdc","c2_num_nulls":0,"c3_maxValue":999.348,"c3_minValue":5.102,"c3_num_nulls":0,"c5_maxValue":101,"c5_minValue":1,"c5_num_nulls":0,"c6_maxValue":"2020-11-27","c6_minValue":"2020-01-01","c6_num_nulls":0,"c7_maxValue":"/w==","c7_minValue":"AA==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0,"file":"part-00000-1c8226c2-f2a0-455d-aedd-c544003b0b3d-c000.snappy.parquet"} -{"c1_maxValue":1000,"c1_minValue":0,"c1_num_nulls":0,"c2_maxValue":" 996sdc","c2_minValue":" 0sdc","c2_num_nulls":0,"c3_maxValue":999.779,"c3_minValue":2.992,"c3_num_nulls":0,"c5_maxValue":101,"c5_minValue":1,"c5_num_nulls":0,"c6_maxValue":"2020-11-28","c6_minValue":"2020-01-01","c6_num_nulls":0,"c7_maxValue":"/g==","c7_minValue":"AA==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0,"file":"part-00000-5034d84a-c4c8-4eba-85b5-a52f47e628a7-c000.snappy.parquet"} -{"c1_maxValue":998,"c1_minValue":2,"c1_num_nulls":0,"c2_maxValue":" 998sdc","c2_minValue":" 104sdc","c2_num_nulls":0,"c3_maxValue":997.905,"c3_minValue":0.876,"c3_num_nulls":0,"c5_maxValue":101,"c5_minValue":1,"c5_num_nulls":0,"c6_maxValue":"2020-11-28","c6_minValue":"2020-01-02","c6_num_nulls":0,"c7_maxValue":"/w==","c7_minValue":"Ag==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0,"file":"part-00001-1c8226c2-f2a0-455d-aedd-c544003b0b3d-c000.snappy.parquet"} -{"c1_maxValue":997,"c1_minValue":3,"c1_num_nulls":0,"c2_maxValue":" 9sdc","c2_minValue":" 102sdc","c2_num_nulls":0,"c3_maxValue":990.531,"c3_minValue":2.336,"c3_num_nulls":0,"c5_maxValue":101,"c5_minValue":1,"c5_num_nulls":0,"c6_maxValue":"2020-11-27","c6_minValue":"2020-01-02","c6_num_nulls":0,"c7_maxValue":"/w==","c7_minValue":"AA==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0,"file":"part-00001-5034d84a-c4c8-4eba-85b5-a52f47e628a7-c000.snappy.parquet"} -{"c1_maxValue":994,"c1_minValue":0,"c1_num_nulls":0,"c2_maxValue":" 9sdc","c2_minValue":" 
0sdc","c2_num_nulls":0,"c3_maxValue":997.496,"c3_minValue":7.742,"c3_num_nulls":0,"c5_maxValue":101,"c5_minValue":1,"c5_num_nulls":0,"c6_maxValue":"2020-11-28","c6_minValue":"2020-01-01","c6_num_nulls":0,"c7_maxValue":"/w==","c7_minValue":"AA==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0,"file":"part-00002-1c8226c2-f2a0-455d-aedd-c544003b0b3d-c000.snappy.parquet"} -{"c1_maxValue":999,"c1_minValue":1,"c1_num_nulls":0,"c2_maxValue":" 999sdc","c2_minValue":" 100sdc","c2_num_nulls":0,"c3_maxValue":980.676,"c3_minValue":0.120,"c3_num_nulls":0,"c5_maxValue":101,"c5_minValue":1,"c5_num_nulls":0,"c6_maxValue":"2020-11-28","c6_minValue":"2020-01-03","c6_num_nulls":0,"c7_maxValue":"/w==","c7_minValue":"AA==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0,"file":"part-00002-5034d84a-c4c8-4eba-85b5-a52f47e628a7-c000.snappy.parquet"} -{"c1_maxValue":999,"c1_minValue":1,"c1_num_nulls":0,"c2_maxValue":" 99sdc","c2_minValue":" 10sdc","c2_num_nulls":0,"c3_maxValue":993.940,"c3_minValue":4.598,"c3_num_nulls":0,"c5_maxValue":101,"c5_minValue":1,"c5_num_nulls":0,"c6_maxValue":"2020-11-28","c6_minValue":"2020-01-03","c6_num_nulls":0,"c7_maxValue":"/g==","c7_minValue":"AQ==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0,"file":"part-00003-1c8226c2-f2a0-455d-aedd-c544003b0b3d-c000.snappy.parquet"} -{"c1_maxValue":998,"c1_minValue":6,"c1_num_nulls":0,"c2_maxValue":" 99sdc","c2_minValue":" 111sdc","c2_num_nulls":0,"c3_maxValue":999.282,"c3_minValue":1.217,"c3_num_nulls":0,"c5_maxValue":101,"c5_minValue":2,"c5_num_nulls":0,"c6_maxValue":"2020-11-28","c6_minValue":"2020-01-01","c6_num_nulls":0,"c7_maxValue":"/w==","c7_minValue":"AA==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0,"file":"part-00003-5034d84a-c4c8-4eba-85b5-a52f47e628a7-c000.snappy.parquet"} \ No newline at end of file +{"c1_maxValue":272,"c1_minValue":8,"c1_num_nulls":0,"c2_maxValue":" 8sdc","c2_minValue":" 129sdc","c2_num_nulls":0,"c3_maxValue":979.272,"c3_minValue":430.129,"c3_num_nulls":0,"c5_maxValue":28,"c5_minValue":2,"c5_num_nulls":0,"c6_maxValue":"2020-11-20","c6_minValue":"2020-03-23","c6_num_nulls":0,"c7_maxValue":"8A==","c7_minValue":"Ag==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0,"file":"part-00003-xxx-c000.snappy.parquet"} +{"c1_maxValue":486,"c1_minValue":59,"c1_num_nulls":0,"c2_maxValue":" 79sdc","c2_minValue":" 111sdc","c2_num_nulls":0,"c3_maxValue":771.590,"c3_minValue":82.111,"c3_num_nulls":0,"c5_maxValue":50,"c5_minValue":7,"c5_num_nulls":0,"c6_maxValue":"2020-11-21","c6_minValue":"2020-01-22","c6_num_nulls":0,"c7_maxValue":"5g==","c7_minValue":"Ow==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0,"file":"part-00002-xxx-c000.snappy.parquet"} +{"c1_maxValue":559,"c1_minValue":74,"c1_num_nulls":0,"c2_maxValue":" 74sdc","c2_minValue":" 181sdc","c2_num_nulls":0,"c3_maxValue":980.213,"c3_minValue":38.740,"c3_num_nulls":0,"c5_maxValue":57,"c5_minValue":9,"c5_num_nulls":0,"c6_maxValue":"2020-11-09","c6_minValue":"2020-01-08","c6_num_nulls":0,"c7_maxValue":"1Q==","c7_minValue":"Gw==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0,"file":"part-00001-xxx-c000.snappy.parquet"} +{"c1_maxValue":639,"c1_minValue":323,"c1_num_nulls":0,"c2_maxValue":" 639sdc","c2_minValue":" 
323sdc","c2_num_nulls":0,"c3_maxValue":811.638,"c3_minValue":100.556,"c3_num_nulls":0,"c5_maxValue":65,"c5_minValue":33,"c5_num_nulls":0,"c6_maxValue":"2020-09-09","c6_minValue":"2020-01-23","c6_num_nulls":0,"c7_maxValue":"fw==","c7_minValue":"Kw==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0,"file":"part-00000-xxx-c000.snappy.parquet"} +{"c1_maxValue":719,"c1_minValue":125,"c1_num_nulls":0,"c2_maxValue":" 719sdc","c2_minValue":" 125sdc","c2_num_nulls":0,"c3_maxValue":958.579,"c3_minValue":153.125,"c3_num_nulls":0,"c5_maxValue":73,"c5_minValue":14,"c5_num_nulls":0,"c6_maxValue":"2020-09-27","c6_minValue":"2020-01-16","c6_num_nulls":0,"c7_maxValue":"+g==","c7_minValue":"OA==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0,"file":"part-00001-xxx-c000.snappy.parquet"} +{"c1_maxValue":770,"c1_minValue":300,"c1_num_nulls":0,"c2_maxValue":" 770sdc","c2_minValue":" 300sdc","c2_num_nulls":0,"c3_maxValue":977.328,"c3_minValue":64.768,"c3_num_nulls":0,"c5_maxValue":78,"c5_minValue":31,"c5_num_nulls":0,"c6_maxValue":"2020-10-21","c6_minValue":"2020-01-01","c6_num_nulls":0,"c7_maxValue":"rw==","c7_minValue":"AA==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0,"file":"part-00000-xxx-c000.snappy.parquet"} +{"c1_maxValue":945,"c1_minValue":355,"c1_num_nulls":0,"c2_maxValue":" 945sdc","c2_minValue":" 355sdc","c2_num_nulls":0,"c3_maxValue":994.355,"c3_minValue":374.882,"c3_num_nulls":0,"c5_maxValue":96,"c5_minValue":37,"c5_num_nulls":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-02-25","c6_num_nulls":0,"c7_maxValue":"sQ==","c7_minValue":"AQ==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0,"file":"part-00002-xxx-c000.snappy.parquet"} +{"c1_maxValue":959,"c1_minValue":0,"c1_num_nulls":0,"c2_maxValue":" 959sdc","c2_minValue":" 0sdc","c2_num_nulls":0,"c3_maxValue":916.697,"c3_minValue":19.000,"c3_num_nulls":0,"c5_maxValue":97,"c5_minValue":1,"c5_num_nulls":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_num_nulls":0,"c7_maxValue":"yA==","c7_minValue":"AA==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0,"file":"part-00003-xxx-c000.snappy.parquet"} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/z-index-table.json b/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/z-index-table.json index 45cb9aaf88c22..a633e3170e108 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/z-index-table.json +++ b/hudi-spark-datasource/hudi-spark/src/test/resources/index/zorder/z-index-table.json @@ -1,4 +1,4 @@ -{"c1_maxValue":1000,"c1_minValue":0,"c1_num_nulls":0,"c2_maxValue":" 996sdc","c2_minValue":" 0sdc","c2_num_nulls":0,"c3_maxValue":999.779,"c3_minValue":2.992,"c3_num_nulls":0,"c5_maxValue":101,"c5_minValue":1,"c5_num_nulls":0,"c6_maxValue":"2020-11-28","c6_minValue":"2020-01-01","c6_num_nulls":0,"c7_maxValue":"/g==","c7_minValue":"AA==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0,"file":"part-00000-5034d84a-c4c8-4eba-85b5-a52f47e628a7-c000.snappy.parquet"} -{"c1_maxValue":997,"c1_minValue":3,"c1_num_nulls":0,"c2_maxValue":" 9sdc","c2_minValue":" 
102sdc","c2_num_nulls":0,"c3_maxValue":990.531,"c3_minValue":2.336,"c3_num_nulls":0,"c5_maxValue":101,"c5_minValue":1,"c5_num_nulls":0,"c6_maxValue":"2020-11-27","c6_minValue":"2020-01-02","c6_num_nulls":0,"c7_maxValue":"/w==","c7_minValue":"AA==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0,"file":"part-00001-5034d84a-c4c8-4eba-85b5-a52f47e628a7-c000.snappy.parquet"} -{"c1_maxValue":999,"c1_minValue":1,"c1_num_nulls":0,"c2_maxValue":" 999sdc","c2_minValue":" 100sdc","c2_num_nulls":0,"c3_maxValue":980.676,"c3_minValue":0.120,"c3_num_nulls":0,"c5_maxValue":101,"c5_minValue":1,"c5_num_nulls":0,"c6_maxValue":"2020-11-28","c6_minValue":"2020-01-03","c6_num_nulls":0,"c7_maxValue":"/w==","c7_minValue":"AA==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0,"file":"part-00002-5034d84a-c4c8-4eba-85b5-a52f47e628a7-c000.snappy.parquet"} -{"c1_maxValue":998,"c1_minValue":6,"c1_num_nulls":0,"c2_maxValue":" 99sdc","c2_minValue":" 111sdc","c2_num_nulls":0,"c3_maxValue":999.282,"c3_minValue":1.217,"c3_num_nulls":0,"c5_maxValue":101,"c5_minValue":2,"c5_num_nulls":0,"c6_maxValue":"2020-11-28","c6_minValue":"2020-01-01","c6_num_nulls":0,"c7_maxValue":"/w==","c7_minValue":"AA==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0,"file":"part-00003-5034d84a-c4c8-4eba-85b5-a52f47e628a7-c000.snappy.parquet"} \ No newline at end of file +{"c1_maxValue":559,"c1_minValue":74,"c1_num_nulls":0,"c2_maxValue":" 74sdc","c2_minValue":" 181sdc","c2_num_nulls":0,"c3_maxValue":980.213,"c3_minValue":38.740,"c3_num_nulls":0,"c5_maxValue":57,"c5_minValue":9,"c5_num_nulls":0,"c6_maxValue":"2020-11-09","c6_minValue":"2020-01-08","c6_num_nulls":0,"c7_maxValue":"1Q==","c7_minValue":"Gw==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0,"file":"part-00001-xxx-c000.snappy.parquet"} +{"c1_maxValue":639,"c1_minValue":323,"c1_num_nulls":0,"c2_maxValue":" 639sdc","c2_minValue":" 323sdc","c2_num_nulls":0,"c3_maxValue":811.638,"c3_minValue":100.556,"c3_num_nulls":0,"c5_maxValue":65,"c5_minValue":33,"c5_num_nulls":0,"c6_maxValue":"2020-09-09","c6_minValue":"2020-01-23","c6_num_nulls":0,"c7_maxValue":"fw==","c7_minValue":"Kw==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0,"file":"part-00000-xxx-c000.snappy.parquet"} +{"c1_maxValue":945,"c1_minValue":355,"c1_num_nulls":0,"c2_maxValue":" 945sdc","c2_minValue":" 355sdc","c2_num_nulls":0,"c3_maxValue":994.355,"c3_minValue":374.882,"c3_num_nulls":0,"c5_maxValue":96,"c5_minValue":37,"c5_num_nulls":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-02-25","c6_num_nulls":0,"c7_maxValue":"sQ==","c7_minValue":"AQ==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0,"file":"part-00002-xxx-c000.snappy.parquet"} +{"c1_maxValue":959,"c1_minValue":0,"c1_num_nulls":0,"c2_maxValue":" 959sdc","c2_minValue":" 0sdc","c2_num_nulls":0,"c3_maxValue":916.697,"c3_minValue":19.000,"c3_num_nulls":0,"c5_maxValue":97,"c5_minValue":1,"c5_num_nulls":0,"c6_maxValue":"2020-11-22","c6_minValue":"2020-01-01","c6_num_nulls":0,"c7_maxValue":"yA==","c7_minValue":"AA==","c7_num_nulls":0,"c8_maxValue":9,"c8_minValue":9,"c8_num_nulls":0,"file":"part-00003-xxx-c000.snappy.parquet"} \ No newline at end of file diff --git a/hudi-spark-datasource/hudi-spark/src/test/resources/sql-statements.sql b/hudi-spark-datasource/hudi-spark/src/test/resources/sql-statements.sql index 135c83b4b975e..e19dd1eb6b8ba 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/resources/sql-statements.sql +++ 
b/hudi-spark-datasource/hudi-spark/src/test/resources/sql-statements.sql @@ -239,10 +239,6 @@ alter table h2_p add columns(ext0 int); +----------+ | ok | +----------+ -alter table h2_p change column ext0 ext0 bigint; -+----------+ -| ok | -+----------+ # DROP TABLE drop table h0; diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestAvroConversionHelper.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestAvroConversionHelper.scala index e29944529b51b..686d09ccf64fd 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestAvroConversionHelper.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestAvroConversionHelper.scala @@ -18,13 +18,13 @@ package org.apache.hudi -import java.time.LocalDate - import org.apache.avro.Schema import org.apache.avro.generic.GenericData import org.apache.spark.sql.catalyst.expressions.GenericRow import org.scalatest.{FunSuite, Matchers} +import java.time.LocalDate + class TestAvroConversionHelper extends FunSuite with Matchers { val dateSchema = s""" @@ -42,7 +42,7 @@ class TestAvroConversionHelper extends FunSuite with Matchers { test("Logical type: date") { val schema = new Schema.Parser().parse(dateSchema) - val convertor = AvroConversionHelper.createConverterToRow(schema, AvroConversionUtils.convertAvroSchemaToStructType(schema)) + val convertor = AvroConversionUtils.createConverterToRow(schema, AvroConversionUtils.convertAvroSchemaToStructType(schema)) val dateOutputData = dateInputData.map(x => { val record = new GenericData.Record(schema) {{ put("date", x) }} diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestDataSourceDefaults.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestDataSourceDefaults.scala index d3be8c9b3e209..7fc7d318d362f 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestDataSourceDefaults.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestDataSourceDefaults.scala @@ -243,7 +243,7 @@ class TestDataSourceDefaults { val partitionPathProp: String = props.getString(DataSourceWriteOptions.PARTITIONPATH_FIELD.key) val STRUCT_NAME: String = "hoodieRowTopLevelField" val NAMESPACE: String = "hoodieRow" - var converterFn: Function1[Any, Any] = _ + var converterFn: Function1[Row, GenericRecord] = _ override def getKey(record: GenericRecord): HoodieKey = { new HoodieKey(HoodieAvroUtils.getNestedFieldValAsString(record, recordKeyProp, true, false), @@ -251,13 +251,13 @@ class TestDataSourceDefaults { } override def getRecordKey(row: Row): String = { - if (null == converterFn) converterFn = AvroConversionHelper.createConverterToAvro(row.schema, STRUCT_NAME, NAMESPACE) + if (null == converterFn) converterFn = AvroConversionUtils.createConverterToAvro(row.schema, STRUCT_NAME, NAMESPACE) val genericRecord = converterFn.apply(row).asInstanceOf[GenericRecord] getKey(genericRecord).getRecordKey } override def getPartitionPath(row: Row): String = { - if (null == converterFn) converterFn = AvroConversionHelper.createConverterToAvro(row.schema, STRUCT_NAME, NAMESPACE) + if (null == converterFn) converterFn = AvroConversionUtils.createConverterToAvro(row.schema, STRUCT_NAME, NAMESPACE) val genericRecord = converterFn.apply(row).asInstanceOf[GenericRecord] getKey(genericRecord).getPartitionPath } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala index 4896ddf07fda2..fa07c573f2725 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala @@ -18,6 +18,7 @@ package org.apache.hudi import org.apache.hadoop.conf.Configuration + import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.client.HoodieJavaWriteClient import org.apache.hudi.client.common.HoodieJavaEngineContext @@ -26,27 +27,32 @@ import org.apache.hudi.common.engine.EngineType import org.apache.hudi.common.model.{HoodieRecord, HoodieTableType} import org.apache.hudi.common.table.HoodieTableMetaClient import org.apache.hudi.common.table.view.HoodieTableFileSystemView -import org.apache.hudi.common.testutils.{HoodieTestDataGenerator, HoodieTestUtils} import org.apache.hudi.common.testutils.HoodieTestTable.makeNewCommitTime import org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings +import org.apache.hudi.common.testutils.{HoodieTestDataGenerator, HoodieTestUtils} import org.apache.hudi.common.util.PartitionPathEncodeUtils import org.apache.hudi.common.util.StringUtils.isNullOrEmpty import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.keygen.ComplexKeyGenerator -import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator.{Config, TimestampType} +import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator.TimestampType +import org.apache.hudi.keygen.constant.KeyGeneratorOptions.Config import org.apache.hudi.testutils.HoodieClientTestBase + import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, EqualTo, GreaterThanOrEqual, LessThan, Literal} import org.apache.spark.sql.execution.datasources.PartitionDirectory import org.apache.spark.sql.functions.{lit, struct} import org.apache.spark.sql.types.StringType import org.apache.spark.sql.{DataFrameWriter, Row, SaveMode, SparkSession} + import org.junit.jupiter.api.Assertions.assertEquals import org.junit.jupiter.api.{BeforeEach, Test} import org.junit.jupiter.params.ParameterizedTest import org.junit.jupiter.params.provider.{Arguments, CsvSource, MethodSource, ValueSource} import java.util.Properties + + import scala.collection.JavaConversions._ import scala.collection.JavaConverters._ @@ -213,8 +219,11 @@ class TestHoodieFileIndex extends HoodieClientTestBase { GreaterThanOrEqual(attribute("partition"), literal("2021/03/08")), LessThan(attribute("partition"), literal("2021/03/10")) ) - val prunedPartitions = fileIndex.listFiles(Seq(partitionFilter2), - Seq.empty).map(_.values.toSeq(Seq(StringType)).mkString(",")).toList + val prunedPartitions = fileIndex.listFiles(Seq(partitionFilter2), Seq.empty) + .map(_.values.toSeq(Seq(StringType)) + .mkString(",")) + .toList + .sorted assertEquals(List("2021/03/08", "2021/03/09"), prunedPartitions) } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriter.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriter.scala index bd520c91f4fa5..b5186fb1ac089 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriter.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkSqlWriter.scala @@ -32,13 +32,13 @@ import org.apache.hudi.functional.TestBootstrap import org.apache.hudi.hive.HiveSyncConfig 
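// Editorial sketch, restating the change below for clarity: on Spark 3.2 and above the test
// session is expected to register Hudi's catalog as the spark_catalog implementation before the
// SparkSession is built, e.g.
//   val sparkConf = new SparkConf()
//   if (HoodieSparkUtils.gteqSpark3_2) {
//     sparkConf.set("spark.sql.catalog.spark_catalog",
//       "org.apache.spark.sql.hudi.catalog.HoodieCatalog")
//   }
//   SparkSession.builder().config(sparkConf).getOrCreate()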
import org.apache.hudi.keygen.{ComplexKeyGenerator, NonpartitionedKeyGenerator, SimpleKeyGenerator} import org.apache.hudi.testutils.DataSourceTestUtils -import org.apache.spark.SparkContext import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.sql._ import org.apache.spark.sql.functions.{expr, lit} import org.apache.spark.sql.hudi.HoodieSparkSessionExtension import org.apache.spark.sql.hudi.command.SqlKeyGenerator import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} +import org.apache.spark.{SparkConf, SparkContext} import org.junit.jupiter.api.Assertions.{assertEquals, assertFalse, assertTrue, fail} import org.junit.jupiter.api.{AfterEach, BeforeEach, Test} import org.junit.jupiter.params.ParameterizedTest @@ -94,11 +94,17 @@ class TestHoodieSparkSqlWriter { * Utility method for initializing the spark context. */ def initSparkContext(): Unit = { + val sparkConf = new SparkConf() + if (HoodieSparkUtils.gteqSpark3_2) { + sparkConf.set("spark.sql.catalog.spark_catalog", + "org.apache.spark.sql.hudi.catalog.HoodieCatalog") + } spark = SparkSession.builder() .appName(hoodieFooTableName) .master("local[2]") .withExtensions(new HoodieSparkSessionExtension) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") + .config(sparkConf) .getOrCreate() sc = spark.sparkContext sc.setLogLevel("ERROR") @@ -543,6 +549,12 @@ class TestHoodieSparkSqlWriter { // Verify that HoodieWriteClient is closed correctly verify(client, times(1)).close() + + val ignoreResult = HoodieSparkSqlWriter.bootstrap(sqlContext, SaveMode.Ignore, fooTableModifier, spark.emptyDataFrame, Option.empty, + Option(client)) + assertFalse(ignoreResult) + verify(client, times(2)).close() + // fetch all records from parquet files generated from write to hudi val actualDf = sqlContext.read.parquet(tempBasePath) assert(actualDf.count == 100) @@ -815,33 +827,32 @@ class TestHoodieSparkSqlWriter { /** * Test case for a non-partitioned table with metadata table support. 
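 * Editorial note: the rewritten version below is parameterized over both COPY_ON_WRITE and
 * MERGE_ON_READ via JUnit's @EnumSource on HoodieTableType, replacing the hand-rolled
 * List(...).foreach over the two table-type constants in the removed version.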
*/ - @Test - def testNonPartitionTableWithMetatableSupport(): Unit = { - List(DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL).foreach { tableType => - val options = Map(DataSourceWriteOptions.TABLE_TYPE.key -> tableType, - DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "col3", - DataSourceWriteOptions.RECORDKEY_FIELD.key -> "keyid", - DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "", - DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key -> "org.apache.hudi.keygen.NonpartitionedKeyGenerator", - HoodieWriteConfig.TBL_NAME.key -> "hoodie_test", - "hoodie.insert.shuffle.parallelism" -> "1", - "hoodie.metadata.enable" -> "true") - val df = spark.range(0, 10).toDF("keyid") - .withColumn("col3", expr("keyid")) - .withColumn("age", expr("keyid + 1000")) - df.write.format("hudi") - .options(options.updated(DataSourceWriteOptions.OPERATION.key, "insert")) - .mode(SaveMode.Overwrite).save(tempBasePath) - // upsert same record again - val df_update = spark.range(0, 10).toDF("keyid") - .withColumn("col3", expr("keyid")) - .withColumn("age", expr("keyid + 2000")) - df_update.write.format("hudi") - .options(options.updated(DataSourceWriteOptions.OPERATION.key, "upsert")) - .mode(SaveMode.Append).save(tempBasePath) - assert(spark.read.format("hudi").load(tempBasePath).count() == 10) - assert(spark.read.format("hudi").load(tempBasePath).where("age >= 2000").count() == 10) - } + @ParameterizedTest + @EnumSource(value = classOf[HoodieTableType]) + def testNonPartitionTableWithMetatableSupport(tableType: HoodieTableType): Unit = { + val options = Map(DataSourceWriteOptions.TABLE_TYPE.key -> tableType.name, + DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "col3", + DataSourceWriteOptions.RECORDKEY_FIELD.key -> "keyid", + DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "", + DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key -> "org.apache.hudi.keygen.NonpartitionedKeyGenerator", + HoodieWriteConfig.TBL_NAME.key -> "hoodie_test", + "hoodie.insert.shuffle.parallelism" -> "1", + "hoodie.metadata.enable" -> "true") + val df = spark.range(0, 10).toDF("keyid") + .withColumn("col3", expr("keyid")) + .withColumn("age", expr("keyid + 1000")) + df.write.format("hudi") + .options(options.updated(DataSourceWriteOptions.OPERATION.key, "insert")) + .mode(SaveMode.Overwrite).save(tempBasePath) + // upsert same record again + val df_update = spark.range(0, 10).toDF("keyid") + .withColumn("col3", expr("keyid")) + .withColumn("age", expr("keyid + 2000")) + df_update.write.format("hudi") + .options(options.updated(DataSourceWriteOptions.OPERATION.key, "upsert")) + .mode(SaveMode.Append).save(tempBasePath) + assert(spark.read.format("hudi").load(tempBasePath).count() == 10) + assert(spark.read.format("hudi").load(tempBasePath).where("age >= 2000").count() == 10) } /** diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkUtils.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkUtils.scala index ad974286ac5a7..9f00b5dcdf64f 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkUtils.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieSparkUtils.scala @@ -18,11 +18,13 @@ package org.apache.hudi +import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path +import org.apache.hudi.exception.SchemaCompatibilityException import 
org.apache.hudi.testutils.DataSourceTestUtils -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.types.{StructType, TimestampType} import org.apache.spark.sql.{Row, SparkSession} import org.junit.jupiter.api.Assertions._ import org.junit.jupiter.api.Test @@ -185,7 +187,7 @@ class TestHoodieSparkUtils { val genRecRDD3 = HoodieSparkUtils.createRdd(df1, "test_struct_name", "test_namespace", true, org.apache.hudi.common.util.Option.of(schema2)) assert(genRecRDD3.collect()(0).getSchema.equals(schema2)) - genRecRDD3.foreach(entry => assertNull(entry.get("nonNullableInnerStruct2"))) + genRecRDD3.foreach(entry => assertNull(entry.get("nullableInnerStruct2"))) val innerStruct3 = new StructType().add("innerKey","string",false).add("innerValue", "long", true) .add("new_nested_col","string",true) @@ -226,12 +228,36 @@ class TestHoodieSparkUtils { fail("createRdd should fail, because records don't have a column which is not nullable in the passed in schema") } catch { case e: Exception => - e.getCause.asInstanceOf[NullPointerException] - assertTrue(e.getMessage.contains("null of string in field new_nested_col of")) + val cause = e.getCause + assertTrue(cause.isInstanceOf[SchemaCompatibilityException]) + assertTrue(e.getMessage.contains("Unable to validate the rewritten record {\"innerKey\": \"innerKey1_2\", \"innerValue\": 2} against schema")) } spark.stop() } + @Test + def testGetRequiredSchema(): Unit = { + val avroSchemaString = "{\"type\":\"record\",\"name\":\"record\"," + + "\"fields\":[{\"name\":\"_hoodie_commit_time\",\"type\":[\"null\",\"string\"],\"doc\":\"\",\"default\":null}," + + "{\"name\":\"_hoodie_commit_seqno\",\"type\":[\"null\",\"string\"],\"doc\":\"\",\"default\":null}," + + "{\"name\":\"_hoodie_record_key\",\"type\":[\"null\",\"string\"],\"doc\":\"\",\"default\":null}," + + "{\"name\":\"_hoodie_partition_path\",\"type\":[\"null\",\"string\"],\"doc\":\"\",\"default\":null}," + + "{\"name\":\"_hoodie_file_name\",\"type\":[\"null\",\"string\"],\"doc\":\"\",\"default\":null}," + + "{\"name\":\"uuid\",\"type\":\"string\"},{\"name\":\"name\",\"type\":[\"null\",\"string\"],\"default\":null}," + + "{\"name\":\"age\",\"type\":[\"null\",\"int\"],\"default\":null}," + + "{\"name\":\"ts\",\"type\":[\"null\",{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"}],\"default\":null}," + + "{\"name\":\"partition\",\"type\":[\"null\",\"string\"],\"default\":null}]}" + + val tableAvroSchema = new Schema.Parser().parse(avroSchemaString) + + val (requiredAvroSchema, requiredStructSchema) = + HoodieSparkUtils.getRequiredSchema(tableAvroSchema, Array("ts")) + + assertEquals("timestamp-millis", + requiredAvroSchema.getField("ts").schema().getTypes.get(1).getLogicalType.getName) + assertEquals(TimestampType, requiredStructSchema.fields(0).dataType) + } + def convertRowListToSeq(inputList: java.util.List[Row]): Seq[Row] = JavaConverters.asScalaIteratorConverter(inputList.iterator).asScala.toSeq } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestMergeOnReadSnapshotRelation.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestMergeOnReadSnapshotRelation.scala deleted file mode 100644 index 80a883a001d98..0000000000000 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestMergeOnReadSnapshotRelation.scala +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi - -import org.apache.avro.Schema -import org.apache.spark.sql.types.TimestampType -import org.junit.jupiter.api.Assertions.assertEquals -import org.junit.jupiter.api.Test - -class TestMergeOnReadSnapshotRelation { - - @Test - def testGetRequiredSchema(): Unit = { - val avroSchemaString = "{\"type\":\"record\",\"name\":\"record\"," + - "\"fields\":[{\"name\":\"_hoodie_commit_time\",\"type\":[\"null\",\"string\"],\"doc\":\"\",\"default\":null}," + - "{\"name\":\"_hoodie_commit_seqno\",\"type\":[\"null\",\"string\"],\"doc\":\"\",\"default\":null}," + - "{\"name\":\"_hoodie_record_key\",\"type\":[\"null\",\"string\"],\"doc\":\"\",\"default\":null}," + - "{\"name\":\"_hoodie_partition_path\",\"type\":[\"null\",\"string\"],\"doc\":\"\",\"default\":null}," + - "{\"name\":\"_hoodie_file_name\",\"type\":[\"null\",\"string\"],\"doc\":\"\",\"default\":null}," + - "{\"name\":\"uuid\",\"type\":\"string\"},{\"name\":\"name\",\"type\":[\"null\",\"string\"],\"default\":null}," + - "{\"name\":\"age\",\"type\":[\"null\",\"int\"],\"default\":null}," + - "{\"name\":\"ts\",\"type\":[\"null\",{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"}],\"default\":null}," + - "{\"name\":\"partition\",\"type\":[\"null\",\"string\"],\"default\":null}]}" - - val tableAvroSchema = new Schema.Parser().parse(avroSchemaString) - - val (requiredAvroSchema, requiredStructSchema) = - MergeOnReadSnapshotRelation.getRequiredSchema(tableAvroSchema, Array("ts")) - - assertEquals("timestamp-millis", - requiredAvroSchema.getField("ts").schema().getTypes.get(1).getLogicalType.getName) - assertEquals(TimestampType, requiredStructSchema.fields(0).dataType) - } -} diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestTableSchemaResolverWithSparkSQL.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestTableSchemaResolverWithSparkSQL.scala new file mode 100644 index 0000000000000..85e1925bc1655 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestTableSchemaResolverWithSparkSQL.scala @@ -0,0 +1,236 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi + +import org.apache.avro.Schema +import org.apache.commons.io.FileUtils +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hudi.avro.HoodieAvroUtils +import org.apache.hudi.avro.model.HoodieMetadataRecord +import org.apache.hudi.common.model._ +import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} +import org.apache.hudi.config.HoodieWriteConfig +import org.apache.hudi.testutils.DataSourceTestUtils +import org.apache.spark.SparkContext +import org.apache.spark.sql._ +import org.apache.spark.sql.hudi.HoodieSparkSessionExtension +import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue} +import org.junit.jupiter.api.{AfterEach, BeforeEach, Tag, Test} +import org.junit.jupiter.params.ParameterizedTest +import org.junit.jupiter.params.provider.CsvSource + +import scala.collection.JavaConverters + +/** + * Test suite for TableSchemaResolver with SparkSqlWriter. + */ +@Tag("functional") +class TestTableSchemaResolverWithSparkSQL { + var spark: SparkSession = _ + var sqlContext: SQLContext = _ + var sc: SparkContext = _ + var tempPath: java.nio.file.Path = _ + var tempBootStrapPath: java.nio.file.Path = _ + var hoodieFooTableName = "hoodie_foo_tbl" + var tempBasePath: String = _ + var commonTableModifier: Map[String, String] = Map() + + case class StringLongTest(uuid: String, ts: Long) + + /** + * Setup method running before each test. + */ + @BeforeEach + def setUp(): Unit = { + initSparkContext() + tempPath = java.nio.file.Files.createTempDirectory("hoodie_test_path") + tempBootStrapPath = java.nio.file.Files.createTempDirectory("hoodie_test_bootstrap") + tempBasePath = tempPath.toAbsolutePath.toString + commonTableModifier = getCommonParams(tempPath, hoodieFooTableName, HoodieTableType.COPY_ON_WRITE.name()) + } + + /** + * Tear down method running after each test. + */ + @AfterEach + def tearDown(): Unit = { + cleanupSparkContexts() + FileUtils.deleteDirectory(tempPath.toFile) + FileUtils.deleteDirectory(tempBootStrapPath.toFile) + } + + /** + * Utility method for initializing the spark context. + */ + def initSparkContext(): Unit = { + spark = SparkSession.builder() + .appName(hoodieFooTableName) + .master("local[2]") + .withExtensions(new HoodieSparkSessionExtension) + .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") + .getOrCreate() + sc = spark.sparkContext + sc.setLogLevel("ERROR") + sqlContext = spark.sqlContext + } + + /** + * Utility method for cleaning up spark resources. + */ + def cleanupSparkContexts(): Unit = { + if (sqlContext != null) { + sqlContext.clearCache(); + sqlContext = null; + } + if (sc != null) { + sc.stop() + sc = null + } + if (spark != null) { + spark.close() + } + } + + /** + * Utility method for creating common params for writer. 
+ * + * @param path Path for hoodie table + * @param hoodieFooTableName Name of hoodie table + * @param tableType Type of table + * @return Map of common params + */ + def getCommonParams(path: java.nio.file.Path, hoodieFooTableName: String, tableType: String): Map[String, String] = { + Map("path" -> path.toAbsolutePath.toString, + HoodieWriteConfig.TBL_NAME.key -> hoodieFooTableName, + "hoodie.insert.shuffle.parallelism" -> "1", + "hoodie.upsert.shuffle.parallelism" -> "1", + DataSourceWriteOptions.TABLE_TYPE.key -> tableType, + DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key", + DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition", + DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key -> "org.apache.hudi.keygen.SimpleKeyGenerator") + } + + /** + * Utility method for converting list of Row to list of Seq. + * + * @param inputList list of Row + * @return list of Seq + */ + def convertRowListToSeq(inputList: java.util.List[Row]): Seq[Row] = + JavaConverters.asScalaIteratorConverter(inputList.iterator).asScala.toSeq + + @Test + def testTableSchemaResolverInMetadataTable(): Unit = { + val schema = DataSourceTestUtils.getStructTypeExampleSchema + //create a new table + val tableName = hoodieFooTableName + val fooTableModifier = Map("path" -> tempPath.toAbsolutePath.toString, + HoodieWriteConfig.TBL_NAME.key -> tableName, + "hoodie.avro.schema" -> schema.toString(), + "hoodie.insert.shuffle.parallelism" -> "1", + "hoodie.upsert.shuffle.parallelism" -> "1", + DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key", + DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition", + DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key -> "org.apache.hudi.keygen.SimpleKeyGenerator", + "hoodie.metadata.compact.max.delta.commits" -> "2", + HoodieWriteConfig.ALLOW_OPERATION_METADATA_FIELD.key -> "true" + ) + + // generate the inserts + val structType = AvroConversionUtils.convertAvroSchemaToStructType(schema) + val records = DataSourceTestUtils.generateRandomRows(10) + val recordsSeq = convertRowListToSeq(records) + val df1 = spark.createDataFrame(sc.parallelize(recordsSeq), structType) + HoodieSparkSqlWriter.write(sqlContext, SaveMode.Overwrite, fooTableModifier, df1) + + // do update + HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableModifier, df1) + + val metadataTablePath = tempPath.toAbsolutePath.toString + "/.hoodie/metadata" + val metaClient = HoodieTableMetaClient.builder() + .setBasePath(metadataTablePath) + .setConf(spark.sessionState.newHadoopConf()) + .build() + + // Delete latest metadata table deltacommit + // Get schema from metadata table hfile format base file. 
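+    // Deleting the latest deltacommit below removes the commit metadata from which the schema
+    // would normally be read, so getTableAvroSchemaFromDataFile has to recover it from the
+    // metadata table's HFile-format base file (expected to match the HoodieMetadataRecord schema).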
+    val latestInstant = metaClient.getActiveTimeline.getCommitsTimeline.getReverseOrderedInstants.findFirst()
+    val path = new Path(metadataTablePath + "/.hoodie", latestInstant.get().getFileName)
+    val fs = path.getFileSystem(new Configuration())
+    fs.delete(path, false)
+    schemaValidationBasedOnDataFile(metaClient, HoodieMetadataRecord.getClassSchema.toString())
+  }
+
+  @ParameterizedTest
+  @CsvSource(Array("COPY_ON_WRITE,parquet", "COPY_ON_WRITE,orc", "COPY_ON_WRITE,hfile",
+    "MERGE_ON_READ,parquet", "MERGE_ON_READ,orc", "MERGE_ON_READ,hfile"))
+  def testTableSchemaResolver(tableType: String, baseFileFormat: String): Unit = {
+    val schema = DataSourceTestUtils.getStructTypeExampleSchema
+
+    // create a new table
+    val tableName = hoodieFooTableName
+    val fooTableModifier = Map("path" -> tempPath.toAbsolutePath.toString,
+      HoodieWriteConfig.BASE_FILE_FORMAT.key -> baseFileFormat,
+      DataSourceWriteOptions.TABLE_TYPE.key -> tableType,
+      HoodieWriteConfig.TBL_NAME.key -> tableName,
+      "hoodie.avro.schema" -> schema.toString(),
+      "hoodie.insert.shuffle.parallelism" -> "1",
+      "hoodie.upsert.shuffle.parallelism" -> "1",
+      DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key",
+      DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition",
+      DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key -> "org.apache.hudi.keygen.SimpleKeyGenerator",
+      HoodieWriteConfig.ALLOW_OPERATION_METADATA_FIELD.key -> "true"
+    )
+
+    // generate the inserts
+    val structType = AvroConversionUtils.convertAvroSchemaToStructType(schema)
+    val records = DataSourceTestUtils.generateRandomRows(10)
+    val recordsSeq = convertRowListToSeq(records)
+    val df1 = spark.createDataFrame(sc.parallelize(recordsSeq), structType)
+    HoodieSparkSqlWriter.write(sqlContext, SaveMode.Overwrite, fooTableModifier, df1)
+
+    val metaClient = HoodieTableMetaClient.builder()
+      .setBasePath(tempPath.toAbsolutePath.toString)
+      .setConf(spark.sessionState.newHadoopConf())
+      .build()
+
+    assertTrue(new TableSchemaResolver(metaClient).isHasOperationField)
+    schemaValidationBasedOnDataFile(metaClient, schema.toString())
+  }
+
+  /**
+   * Validates the schema read from the data file via getTableAvroSchemaFromDataFile.
+   *
+   * @param metaClient   meta client for the table under test
+   * @param schemaString expected table schema as an Avro JSON string
+   */
+  def schemaValidationBasedOnDataFile(metaClient: HoodieTableMetaClient, schemaString: String): Unit = {
+    metaClient.reloadActiveTimeline()
+    var tableSchemaResolverParsingException: Exception = null
+    try {
+      val schemaFromData = new TableSchemaResolver(metaClient).getTableAvroSchemaFromDataFile
+      val structFromData = AvroConversionUtils.convertAvroSchemaToStructType(HoodieAvroUtils.removeMetadataFields(schemaFromData))
+      val schemeDesign = new Schema.Parser().parse(schemaString)
+      val structDesign = AvroConversionUtils.convertAvroSchemaToStructType(schemeDesign)
+      assertEquals(structFromData, structDesign)
+    } catch {
+      case e: Exception => tableSchemaResolverParsingException = e
+    }
+    assert(tableSchemaResolverParsingException == null)
+  }
+}
diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala
index e8b179804dfca..96d50f6b57b80 100644
--- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala
+++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala
@@ -19,22 +19,24 @@ package org.apache.hudi.functional
 import org.apache.hadoop.fs.FileSystem
 import org.apache.hudi.common.config.HoodieMetadataConfig
+import org.apache.hudi.common.model.HoodieRecord
 import org.apache.hudi.common.table.timeline.HoodieInstant
 import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
 import org.apache.hudi.common.testutils.HoodieTestDataGenerator
 import org.apache.hudi.common.testutils.RawTripTestPayload.{deleteRecordsToStrings, recordsToStrings}
 import org.apache.hudi.config.HoodieWriteConfig
-import org.apache.hudi.exception.HoodieUpsertException
-import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator.Config
+import org.apache.hudi.exception.{HoodieException, HoodieUpsertException}
 import org.apache.hudi.keygen._
+import org.apache.hudi.keygen.constant.KeyGeneratorOptions.Config
 import org.apache.hudi.testutils.HoodieClientTestBase
-import org.apache.hudi.{AvroConversionUtils, DataSourceReadOptions, DataSourceWriteOptions, HoodieDataSourceHelpers}
+import org.apache.hudi.{AvroConversionUtils, DataSourceReadOptions, DataSourceWriteOptions, HoodieDataSourceHelpers, HoodieMergeOnReadRDD}
 import org.apache.spark.sql._
 import org.apache.spark.sql.functions.{col, concat, lit, udf}
 import org.apache.spark.sql.types._
 import org.joda.time.DateTime
 import org.joda.time.format.DateTimeFormat
-import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue, fail}
+import org.junit.jupiter.api.Assertions.{assertEquals, assertThrows, assertTrue, fail}
+import org.junit.jupiter.api.function.Executable
 import org.junit.jupiter.api.{AfterEach, BeforeEach, Test}
 import org.junit.jupiter.params.ParameterizedTest
 import org.junit.jupiter.params.provider.{CsvSource, ValueSource}
@@ -57,7 +59,8 @@ class TestCOWDataSource extends HoodieClientTestBase {
     DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key",
     DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition",
     DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "timestamp",
-    HoodieWriteConfig.TBL_NAME.key -> "hoodie_test"
+    HoodieWriteConfig.TBL_NAME.key -> "hoodie_test",
+    HoodieMetadataConfig.COMPACT_NUM_DELTA_COMMITS.key -> "1"
   )
 
   val verificationCol: String = "driver"
@@ -92,6 +95,79 @@ class TestCOWDataSource extends HoodieClientTestBase {
     assertTrue(HoodieDataSourceHelpers.hasNewCommits(fs, basePath, "000"))
   }
+
+  @Test def testHoodieIsDeletedNonBooleanField() {
+    // Insert Operation
+    val records = recordsToStrings(dataGen.generateInserts("000", 100)).toList
+    val inputDF = spark.read.json(spark.sparkContext.parallelize(records, 2))
+    val df = inputDF.withColumn(HoodieRecord.HOODIE_IS_DELETED, lit("abc"))
+
+    assertThrows(classOf[HoodieException], new Executable {
+      override def execute(): Unit = {
+        df.write.format("hudi")
+          .options(commonOpts)
+          .mode(SaveMode.Overwrite)
+          .save(basePath)
+      }
+    }, "Should have failed since _hoodie_is_deleted is not a BOOLEAN data type")
+  }
+
+  /**
+   * Tests querying with a partition condition on a Hudi table whose partition-field values
+   * differ from the physical partition paths, e.g. a table written with
+   * TimestampBasedKeyGenerator.
+   *
+   * For a COW table, tests the snapshot and incremental query modes.
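+   * For example, with the output format "yyyy/MM/dd" a record whose partition field is
+   * "2022-01-01" is stored under the physical path 2022/01/01, yet the predicate
+   * partition = '2022-01-01' must still prune the query down to exactly that path.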
+ */ + @Test + def testPrunePartitionForTimestampBasedKeyGenerator(): Unit = { + val options = commonOpts ++ Map( + "hoodie.compact.inline" -> "false", + DataSourceWriteOptions.TABLE_TYPE.key -> DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL, + DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key -> "org.apache.hudi.keygen.TimestampBasedKeyGenerator", + Config.TIMESTAMP_TYPE_FIELD_PROP -> "DATE_STRING", + Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP -> "yyyy/MM/dd", + Config.TIMESTAMP_TIMEZONE_FORMAT_PROP -> "GMT+8:00", + Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP -> "yyyy-MM-dd" + ) + + val dataGen1 = new HoodieTestDataGenerator(Array("2022-01-01")) + val records1 = recordsToStrings(dataGen1.generateInserts("001", 20)).toList + val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2)) + inputDF1.write.format("org.apache.hudi") + .options(options) + .mode(SaveMode.Overwrite) + .save(basePath) + metaClient = HoodieTableMetaClient.builder() + .setBasePath(basePath) + .setConf(spark.sessionState.newHadoopConf) + .build() + val commit1Time = metaClient.getActiveTimeline.lastInstant().get().getTimestamp + + val dataGen2 = new HoodieTestDataGenerator(Array("2022-01-02")) + val records2 = recordsToStrings(dataGen2.generateInserts("002", 30)).toList + val inputDF2 = spark.read.json(spark.sparkContext.parallelize(records2, 2)) + inputDF2.write.format("org.apache.hudi") + .options(options) + .mode(SaveMode.Append) + .save(basePath) + val commit2Time = metaClient.reloadActiveTimeline.lastInstant().get().getTimestamp + + // snapshot query + val snapshotQueryRes = spark.read.format("hudi").load(basePath) + assertEquals(snapshotQueryRes.where("partition = '2022-01-01'").count, 20) + assertEquals(snapshotQueryRes.where("partition = '2022-01-02'").count, 30) + + // incremental query + val incrementalQueryRes = spark.read.format("hudi") + .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL) + .option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, commit1Time) + .option(DataSourceReadOptions.END_INSTANTTIME.key, commit2Time) + .load(basePath) + assertEquals(incrementalQueryRes.where("partition = '2022-01-01'").count, 0) + assertEquals(incrementalQueryRes.where("partition = '2022-01-02'").count, 30) + } + /** * Test for https://issues.apache.org/jira/browse/HUDI-1615. Null Schema in BulkInsert row writer flow. 
* This was reported by customer when archival kicks in as the schema in commit metadata is not set for bulk_insert @@ -156,7 +232,7 @@ class TestCOWDataSource extends HoodieClientTestBase { val snapshotDF2 = spark.read.format("org.apache.hudi") .load(basePath + "/*/*/*/*") - assertEquals(snapshotDF1.count() - inputDF2.count(), snapshotDF2.count()) + assertEquals(snapshotDF2.count(), 80) } @Test def testOverWriteModeUseReplaceAction(): Unit = { @@ -404,15 +480,10 @@ class TestCOWDataSource extends HoodieClientTestBase { } private def getDataFrameWriter(keyGenerator: String): DataFrameWriter[Row] = { - getDataFrameWriter(keyGenerator, true) - } - - private def getDataFrameWriter(keyGenerator: String, enableMetadata: Boolean): DataFrameWriter[Row] = { val records = recordsToStrings(dataGen.generateInserts("000", 100)).toList val inputDF = spark.read.json(spark.sparkContext.parallelize(records, 2)) - val opts = commonOpts ++ Map(HoodieMetadataConfig.ENABLE.key() -> String.valueOf(enableMetadata)) inputDF.write.format("hudi") - .options(opts) + .options(commonOpts) .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key, keyGenerator) .mode(SaveMode.Overwrite) } @@ -440,7 +511,7 @@ class TestCOWDataSource extends HoodieClientTestBase { assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= udf_date_format(col("current_ts"))).count() == 0) // Mixed fieldType - writer = getDataFrameWriter(classOf[CustomKeyGenerator].getName, false) + writer = getDataFrameWriter(classOf[CustomKeyGenerator].getName) writer.partitionBy("driver", "rider:SIMPLE", "current_ts:TIMESTAMP") .option(Config.TIMESTAMP_TYPE_FIELD_PROP, "EPOCHMILLISECONDS") .option(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, "yyyyMMdd") @@ -452,7 +523,7 @@ class TestCOWDataSource extends HoodieClientTestBase { concat(col("driver"), lit("/"), col("rider"), lit("/"), udf_date_format(col("current_ts")))).count() == 0) // Test invalid partitionKeyType - writer = getDataFrameWriter(classOf[CustomKeyGenerator].getName, false) + writer = getDataFrameWriter(classOf[CustomKeyGenerator].getName) writer = writer.partitionBy("current_ts:DUMMY") .option(Config.TIMESTAMP_TYPE_FIELD_PROP, "EPOCHMILLISECONDS") .option(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, "yyyyMMdd") @@ -708,6 +779,89 @@ class TestCOWDataSource extends HoodieClientTestBase { assertEquals(numRecords - numRecordsToDelete, snapshotDF2.count()) } + @Test def testFailEarlyForIncrViewQueryForNonExistingFiles(): Unit = { + // Create 10 commits + for (i <- 1 to 10) { + val records = recordsToStrings(dataGen.generateInserts("%05d".format(i), 100)).toList + val inputDF = spark.read.json(spark.sparkContext.parallelize(records, 2)) + inputDF.write.format("org.apache.hudi") + .options(commonOpts) + .option("hoodie.cleaner.commits.retained", "3") + .option("hoodie.keep.min.commits", "4") + .option("hoodie.keep.max.commits", "5") + .option(DataSourceWriteOptions.OPERATION.key(), DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) + .mode(SaveMode.Append) + .save(basePath) + } + + val hoodieMetaClient = HoodieTableMetaClient.builder().setConf(spark.sparkContext.hadoopConfiguration).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build() + /** + * State of timeline after 10 commits + * +------------------+--------------------------------------+ + * | Archived | Active Timeline | + * +------------------+--------------+-----------------------+ + * | C0 C1 C2 C3 | C4 C5 | C6 C7 C8 C9 | + * +------------------+--------------+-----------------------+ + * | Data cleaned | Data exists in table | 
+ * +---------------------------------+-----------------------+ + */ + + val completedCommits = hoodieMetaClient.getCommitsTimeline.filterCompletedInstants() // C4 to C9 + //Anything less than 2 is a valid commit in the sense no cleanup has been done for those commit files + var startTs = completedCommits.nthInstant(0).get().getTimestamp //C4 + var endTs = completedCommits.nthInstant(1).get().getTimestamp //C5 + + //Calling without the fallback should result in Path does not exist + var hoodieIncViewDF = spark.read.format("org.apache.hudi") + .option(DataSourceReadOptions.QUERY_TYPE.key(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL) + .option(DataSourceReadOptions.BEGIN_INSTANTTIME.key(), startTs) + .option(DataSourceReadOptions.END_INSTANTTIME.key(), endTs) + .load(basePath) + + val msg = "Should fail with Path does not exist" + assertThrows(classOf[AnalysisException], new Executable { + override def execute(): Unit = { + hoodieIncViewDF.count() + } + }, msg) + + //Should work with fallback enabled + hoodieIncViewDF = spark.read.format("org.apache.hudi") + .option(DataSourceReadOptions.QUERY_TYPE.key(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL) + .option(DataSourceReadOptions.BEGIN_INSTANTTIME.key(), startTs) + .option(DataSourceReadOptions.END_INSTANTTIME.key(), endTs) + .option(DataSourceReadOptions.INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN_FOR_NON_EXISTING_FILES.key(), "true") + .load(basePath) + assertEquals(100, hoodieIncViewDF.count()) + + //Test out for archived commits + val archivedInstants = hoodieMetaClient.getArchivedTimeline.getInstants.distinct().toArray + startTs = archivedInstants(0).asInstanceOf[HoodieInstant].getTimestamp //C0 + endTs = completedCommits.nthInstant(1).get().getTimestamp //C5 + + //Calling without the fallback should result in Path does not exist + hoodieIncViewDF = spark.read.format("org.apache.hudi") + .option(DataSourceReadOptions.QUERY_TYPE.key(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL) + .option(DataSourceReadOptions.BEGIN_INSTANTTIME.key(), startTs) + .option(DataSourceReadOptions.END_INSTANTTIME.key(), endTs) + .load(basePath) + + assertThrows(classOf[AnalysisException], new Executable { + override def execute(): Unit = { + hoodieIncViewDF.count() + } + }, msg) + + //Should work with fallback enabled + hoodieIncViewDF = spark.read.format("org.apache.hudi") + .option(DataSourceReadOptions.QUERY_TYPE.key(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL) + .option(DataSourceReadOptions.BEGIN_INSTANTTIME.key(), startTs) + .option(DataSourceReadOptions.END_INSTANTTIME.key(), endTs) + .option(DataSourceReadOptions.INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN_FOR_NON_EXISTING_FILES.key(), "true") + .load(basePath) + assertEquals(500, hoodieIncViewDF.count()) + } + def copyOnWriteTableSelect(enableDropPartitionColumns: Boolean): Boolean = { val records1 = recordsToStrings(dataGen.generateInsertsContainsAllPartitions("000", 3)).toList val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2)) @@ -748,4 +902,48 @@ class TestCOWDataSource extends HoodieClientTestBase { assertEquals(inputDF2.sort("_row_key").select("shortDecimal").collect().map(_.getDecimal(0).toPlainString).mkString(","), readResult.sort("_row_key").select("shortDecimal").collect().map(_.getDecimal(0).toPlainString).mkString(",")) } + + @Test + def testHoodieBaseFileOnlyViewRelation(): Unit = { + val _spark = spark + import _spark.implicits._ + + val df = Seq((1, "z3", 30, "v1", "2018-09-23"), (2, "z3", 35, "v1", "2018-09-24")) + .toDF("id", 
"name", "age", "ts", "data_date") + + df.write.format("hudi") + .options(commonOpts) + .option(DataSourceWriteOptions.RECORDKEY_FIELD.key, "id") + .option(DataSourceWriteOptions.RECORDKEY_FIELD.key, "id") + .option(DataSourceWriteOptions.RECORDKEY_FIELD.key, "id") + .option("hoodie.insert.shuffle.parallelism", "4") + .option("hoodie.upsert.shuffle.parallelism", "4") + .option("hoodie.bulkinsert.shuffle.parallelism", "2") + .option(DataSourceWriteOptions.RECORDKEY_FIELD.key, "id") + .option(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, "data_date") + .option(DataSourceWriteOptions.PRECOMBINE_FIELD.key, "ts") + .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key, "org.apache.hudi.keygen.TimestampBasedKeyGenerator") + .option(Config.TIMESTAMP_TYPE_FIELD_PROP, "DATE_STRING") + .option(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, "yyyy/MM/dd") + .option(Config.TIMESTAMP_TIMEZONE_FORMAT_PROP, "GMT+8:00") + .option(Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP, "yyyy-MM-dd") + .mode(org.apache.spark.sql.SaveMode.Append) + .save(basePath) + + val res = spark.read.format("hudi").load(basePath) + + assert(res.count() == 2) + + // data_date is the partition field. Persist to the parquet file using the origin values, and read it. + assertTrue( + res.select("data_date").map(_.get(0).toString).collect().sorted.sameElements( + Array("2018-09-23", "2018-09-24") + ) + ) + assertTrue( + res.select("_hoodie_partition_path").map(_.get(0).toString).collect().sorted.sameElements( + Array("2018/09/23", "2018/09/24") + ) + ) + } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSourceStorage.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSourceStorage.scala index bf616e2cb314a..e7daf08d1193c 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSourceStorage.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSourceStorage.scala @@ -26,7 +26,7 @@ import org.apache.hudi.common.table.timeline.{HoodieInstant, HoodieTimeline} import org.apache.hudi.common.testutils.HoodieTestDataGenerator import org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings import org.apache.hudi.config.HoodieWriteConfig -import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator.Config +import org.apache.hudi.keygen.constant.KeyGeneratorOptions.Config import org.apache.hudi.keygen.{ComplexKeyGenerator, TimestampBasedKeyGenerator} import org.apache.hudi.testutils.SparkClientFunctionalTestHarness import org.apache.hudi.{DataSourceReadOptions, DataSourceWriteOptions, HoodieDataSourceHelpers} @@ -58,9 +58,14 @@ class TestCOWDataSourceStorage extends SparkClientFunctionalTestHarness { val updatedVerificationVal: String = "driver_update" @ParameterizedTest - @CsvSource(Array("true,org.apache.hudi.keygen.SimpleKeyGenerator", "true,org.apache.hudi.keygen.ComplexKeyGenerator", - "true,org.apache.hudi.keygen.TimestampBasedKeyGenerator", "false,org.apache.hudi.keygen.SimpleKeyGenerator", - "false,org.apache.hudi.keygen.ComplexKeyGenerator", "false,org.apache.hudi.keygen.TimestampBasedKeyGenerator")) + @CsvSource(Array( + "true,org.apache.hudi.keygen.SimpleKeyGenerator", + "true,org.apache.hudi.keygen.ComplexKeyGenerator", + "true,org.apache.hudi.keygen.TimestampBasedKeyGenerator", + "false,org.apache.hudi.keygen.SimpleKeyGenerator", + "false,org.apache.hudi.keygen.ComplexKeyGenerator", + "false,org.apache.hudi.keygen.TimestampBasedKeyGenerator" + )) 
def testCopyOnWriteStorage(isMetadataEnabled: Boolean, keyGenClass: String): Unit = { commonOpts += DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key() -> keyGenClass if (classOf[ComplexKeyGenerator].getName.equals(keyGenClass)) { @@ -72,7 +77,7 @@ class TestCOWDataSourceStorage extends SparkClientFunctionalTestHarness { commonOpts += Config.TIMESTAMP_TYPE_FIELD_PROP -> "EPOCHMILLISECONDS" commonOpts += Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP -> "yyyyMMdd" } - val dataGen = new HoodieTestDataGenerator() + val dataGen = new HoodieTestDataGenerator(0xDEED) val fs = FSUtils.getFs(basePath, spark.sparkContext.hadoopConfiguration) // Insert Operation val records0 = recordsToStrings(dataGen.generateInserts("000", 100)).toList @@ -99,9 +104,13 @@ class TestCOWDataSourceStorage extends SparkClientFunctionalTestHarness { var updateDf: DataFrame = null if (classOf[TimestampBasedKeyGenerator].getName.equals(keyGenClass)) { // update current_ts to be same as original record so that partition path does not change with timestamp based key gen - val orignalRow = inputDF1.filter(col("_row_key") === verificationRowKey).collectAsList().get(0) - updateDf = snapshotDF1.filter(col("_row_key") === verificationRowKey).withColumn(verificationCol, lit(updatedVerificationVal)) - .withColumn("current_ts", lit(orignalRow.getAs("current_ts"))) + val originalRow = snapshotDF1.filter(col("_row_key") === verificationRowKey).collectAsList().get(0) + updateDf = inputDF1.filter(col("_row_key") === verificationRowKey) + .withColumn(verificationCol, lit(updatedVerificationVal)) + .withColumn("current_ts", lit(originalRow.getAs[Long]("current_ts"))) + .limit(1) + val updatedRow = updateDf.collectAsList().get(0) + assertEquals(originalRow.getAs[Long]("current_ts"), updatedRow.getAs[Long]("current_ts")); } else { updateDf = snapshotDF1.filter(col("_row_key") === verificationRowKey).withColumn(verificationCol, lit(updatedVerificationVal)) } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala index e79067041fb62..ae41fa8eb551f 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestColumnStatsIndex.scala @@ -18,16 +18,22 @@ package org.apache.hudi.functional -import org.apache.hadoop.fs.{LocatedFileStatus, Path} +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path} +import org.apache.hudi.common.util.ParquetUtils import org.apache.hudi.index.columnstats.ColumnStatsIndexHelper import org.apache.hudi.testutils.HoodieClientTestBase +import org.apache.spark.sql._ +import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.functions.typedLit import org.apache.spark.sql.types._ -import org.apache.spark.sql.{DataFrame, SparkSession} -import org.junit.jupiter.api.Assertions.assertEquals -import org.junit.jupiter.api.{AfterEach, BeforeEach, Disabled, Test} +import org.junit.jupiter.api.Assertions.{assertEquals, assertNotNull, assertTrue} +import org.junit.jupiter.api.{AfterEach, BeforeEach, Test} -import scala.collection.JavaConversions._ +import java.math.BigInteger +import java.sql.{Date, Timestamp} +import scala.collection.JavaConverters._ +import scala.util.Random class TestColumnStatsIndex extends HoodieClientTestBase { var spark: 
SparkSession = _
@@ -58,15 +64,17 @@
   }
 
   @Test
-  @Disabled
-  def testColumnStatsTableComposition(): Unit = {
+  def testZIndexTableComposition(): Unit = {
+    val targetParquetTablePath = tempDir.resolve("index/zorder/input-table").toAbsolutePath.toString
+    val sourceJSONTablePath = getClass.getClassLoader.getResource("index/zorder/input-table-json").toString
+
+    bootstrapParquetInputTableFromJSON(sourceJSONTablePath, targetParquetTablePath)
+
     val inputDf =
       // NOTE: Schema here is provided for validation that the input data is in the appropriate format
       spark.read
         .schema(sourceTableSchema)
-        .parquet(
-          getClass.getClassLoader.getResource("index/zorder/input-table").toString
-        )
+        .parquet(targetParquetTablePath)
 
     val zorderedCols = Seq("c1", "c2", "c3", "c5", "c6", "c7", "c8")
     val zorderedColsSchemaFields = inputDf.schema.fields.filter(f => zorderedCols.contains(f.name)).toSeq
@@ -75,22 +83,18 @@ class TestColumnStatsIndex extends HoodieClientTestBase {
     val newZIndexTableDf =
       ColumnStatsIndexHelper.buildColumnStatsTableFor(
         inputDf.sparkSession,
-        inputDf.inputFiles.toSeq,
-        zorderedColsSchemaFields
+        inputDf.inputFiles.toSeq.asJava,
+        zorderedColsSchemaFields.asJava
       )
 
     val indexSchema =
       ColumnStatsIndexHelper.composeIndexSchema(
-        sourceTableSchema.fields.filter(f => zorderedCols.contains(f.name)).toSeq
+        sourceTableSchema.fields.filter(f => zorderedCols.contains(f.name)).toSeq.asJava
       )
 
     // Collect Z-index stats manually (reading individual Parquet files)
     val manualZIndexTableDf =
-      buildColumnStatsTableManually(
-        getClass.getClassLoader.getResource("index/zorder/input-table").toString,
-        zorderedCols,
-        indexSchema
-      )
+      buildColumnStatsTableManually(targetParquetTablePath, zorderedCols, indexSchema)
 
     // NOTE: Z-index is built against stats collected within Parquet footers, which will be
     //       represented with the corresponding Parquet schema (INT, INT64, INT96, etc).
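The NOTE above is the key mechanism under test: the index is derived from footer-level Parquet statistics. As a minimal illustrative sketch (assuming parquet-hadoop is on the classpath; printFooterStats is a hypothetical helper, not a Hudi or patch API), per-column min/max values can be read straight from a footer like so:

    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.fs.Path
    import org.apache.parquet.hadoop.ParquetFileReader
    import org.apache.parquet.hadoop.util.HadoopInputFile

    import scala.collection.JavaConverters._

    // Prints the per-column min/max statistics recorded in a Parquet file's footer
    def printFooterStats(conf: Configuration, file: Path): Unit = {
      val reader = ParquetFileReader.open(HadoopInputFile.fromPath(file, conf))
      try {
        for (block <- reader.getFooter.getBlocks.asScala;
             column <- block.getColumns.asScala) {
          val stats = column.getStatistics
          // Statistics may be absent for some physical types (cf. the timestamp caveat
          // noted elsewhere in this suite)
          if (stats != null && !stats.isEmpty) {
            println(s"${column.getPath.toDotString}: min=${stats.genericGetMin}, max=${stats.genericGetMax}")
          }
        }
      } finally {
        reader.close()
      }
    }

This is conceptually what ParquetUtils.readRangeFromParquetMetadata, exercised by testParquetMetadataRangeExtraction below, extracts on the index's behalf.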
@@ -107,18 +111,23 @@ class TestColumnStatsIndex extends HoodieClientTestBase { .schema(indexSchema) .json(getClass.getClassLoader.getResource("index/zorder/z-index-table.json").toString) - assertEquals(asJson(sort(expectedZIndexTableDf)), asJson(sort(newZIndexTableDf))) + assertEquals(asJson(sort(expectedZIndexTableDf)), asJson(sort(replace(newZIndexTableDf)))) } @Test - @Disabled - def testColumnStatsTableMerge(): Unit = { + def testZIndexTableMerge(): Unit = { val testZIndexPath = new Path(basePath, "zindex") + val firstParquetTablePath = tempDir.resolve("index/zorder/input-table").toAbsolutePath.toString + val firstJSONTablePath = getClass.getClassLoader.getResource("index/zorder/input-table-json").toString + + // Bootstrap FIRST source Parquet table + bootstrapParquetInputTableFromJSON(firstJSONTablePath, firstParquetTablePath) + val zorderedCols = Seq("c1", "c2", "c3", "c5", "c6", "c7", "c8") val indexSchema = ColumnStatsIndexHelper.composeIndexSchema( - sourceTableSchema.fields.filter(f => zorderedCols.contains(f.name)).toSeq + sourceTableSchema.fields.filter(f => zorderedCols.contains(f.name)).toSeq.asJava ) // @@ -126,19 +135,16 @@ class TestColumnStatsIndex extends HoodieClientTestBase { // val firstCommitInstance = "0" - val firstInputDf = - spark.read.parquet( - getClass.getClassLoader.getResource("index/zorder/input-table").toString - ) + val firstInputDf = spark.read.parquet(firstParquetTablePath) ColumnStatsIndexHelper.updateColumnStatsIndexFor( firstInputDf.sparkSession, sourceTableSchema, - firstInputDf.inputFiles.toSeq, - zorderedCols.toSeq, + firstInputDf.inputFiles.toSeq.asJava, + zorderedCols.asJava, testZIndexPath.toString, firstCommitInstance, - Seq() + Seq().asJava ) // NOTE: We don't need to provide schema upon reading from Parquet, since Spark will be able @@ -152,15 +158,19 @@ class TestColumnStatsIndex extends HoodieClientTestBase { .schema(indexSchema) .json(getClass.getClassLoader.getResource("index/zorder/z-index-table.json").toString) - assertEquals(asJson(sort(expectedInitialZIndexTableDf)), asJson(sort(initialZIndexTable))) + assertEquals(asJson(sort(expectedInitialZIndexTableDf)), asJson(sort(replace(initialZIndexTable)))) + + // Bootstrap SECOND source Parquet table + val secondParquetTablePath = tempDir.resolve("index/zorder/another-input-table").toAbsolutePath.toString + val secondJSONTablePath = getClass.getClassLoader.getResource("index/zorder/another-input-table-json").toString + + bootstrapParquetInputTableFromJSON(secondJSONTablePath, secondParquetTablePath) val secondCommitInstance = "1" val secondInputDf = spark.read .schema(sourceTableSchema) - .parquet( - getClass.getClassLoader.getResource("index/zorder/another-input-table").toString - ) + .parquet(secondParquetTablePath) // // Update Z-index table @@ -169,11 +179,11 @@ class TestColumnStatsIndex extends HoodieClientTestBase { ColumnStatsIndexHelper.updateColumnStatsIndexFor( secondInputDf.sparkSession, sourceTableSchema, - secondInputDf.inputFiles.toSeq, - zorderedCols.toSeq, + secondInputDf.inputFiles.toSeq.asJava, + zorderedCols.asJava, testZIndexPath.toString, secondCommitInstance, - Seq(firstCommitInstance) + Seq(firstCommitInstance).asJava ) // NOTE: We don't need to provide schema upon reading from Parquet, since Spark will be able @@ -187,56 +197,96 @@ class TestColumnStatsIndex extends HoodieClientTestBase { .schema(indexSchema) .json(getClass.getClassLoader.getResource("index/zorder/z-index-table-merged.json").toString) - assertEquals(asJson(sort(expectedMergedZIndexTableDf)), 
asJson(sort(mergedZIndexTable)))
+    assertEquals(asJson(sort(expectedMergedZIndexTableDf)), asJson(sort(replace(mergedZIndexTable))))
   }
 
   @Test
-  @Disabled
   def testColumnStatsTablesGarbageCollection(): Unit = {
-    val testZIndexPath = new Path(System.getProperty("java.io.tmpdir"), "zindex")
-    val fs = testZIndexPath.getFileSystem(spark.sparkContext.hadoopConfiguration)
+    val targetParquetTablePath = tempDir.resolve("index/zorder/input-table").toAbsolutePath.toString
+    val sourceJSONTablePath = getClass.getClassLoader.getResource("index/zorder/input-table-json").toString
 
-    val inputDf =
-      spark.read.parquet(
-        getClass.getClassLoader.getResource("index/zorder/input-table").toString
-      )
+    bootstrapParquetInputTableFromJSON(sourceJSONTablePath, targetParquetTablePath)
+
+    val inputDf = spark.read.parquet(targetParquetTablePath)
+
+    val testColumnStatsIndexPath = new Path(tempDir.resolve("zindex").toAbsolutePath.toString)
+    val fs = testColumnStatsIndexPath.getFileSystem(spark.sparkContext.hadoopConfiguration)
 
     // Try to save statistics
     ColumnStatsIndexHelper.updateColumnStatsIndexFor(
       inputDf.sparkSession,
       sourceTableSchema,
-      inputDf.inputFiles.toSeq,
-      Seq("c1","c2","c3","c5","c6","c7","c8"),
-      testZIndexPath.toString,
+      inputDf.inputFiles.toSeq.asJava,
+      Seq("c1","c2","c3","c5","c6","c7","c8").asJava,
+      testColumnStatsIndexPath.toString,
       "2",
-      Seq("0", "1")
+      Seq("0", "1").asJava
     )
 
     // Save again
     ColumnStatsIndexHelper.updateColumnStatsIndexFor(
       inputDf.sparkSession,
       sourceTableSchema,
-      inputDf.inputFiles.toSeq,
-      Seq("c1","c2","c3","c5","c6","c7","c8"),
-      testZIndexPath.toString,
+      inputDf.inputFiles.toSeq.asJava,
+      Seq("c1","c2","c3","c5","c6","c7","c8").asJava,
+      testColumnStatsIndexPath.toString,
       "3",
-      Seq("0", "1", "2")
+      Seq("0", "1", "2").asJava
     )
 
     // Test old index table being cleaned up
     ColumnStatsIndexHelper.updateColumnStatsIndexFor(
       inputDf.sparkSession,
       sourceTableSchema,
-      inputDf.inputFiles.toSeq,
-      Seq("c1","c2","c3","c5","c6","c7","c8"),
-      testZIndexPath.toString,
+      inputDf.inputFiles.toSeq.asJava,
+      Seq("c1","c2","c3","c5","c6","c7","c8").asJava,
+      testColumnStatsIndexPath.toString,
       "4",
-      Seq("0", "1", "3")
+      Seq("0", "1", "3").asJava
     )
 
-    assertEquals(!fs.exists(new Path(testZIndexPath, "2")), true)
-    assertEquals(!fs.exists(new Path(testZIndexPath, "3")), true)
-    assertEquals(fs.exists(new Path(testZIndexPath, "4")), true)
+    assertEquals(!fs.exists(new Path(testColumnStatsIndexPath, "2")), true)
+    assertEquals(!fs.exists(new Path(testColumnStatsIndexPath, "3")), true)
+    assertEquals(fs.exists(new Path(testColumnStatsIndexPath, "4")), true)
+  }
+
+  @Test
+  def testParquetMetadataRangeExtraction(): Unit = {
+    val df = generateRandomDataFrame(spark)
+
+    val pathStr = tempDir.resolve("min-max").toAbsolutePath.toString
+
+    df.write.format("parquet")
+      .mode(SaveMode.Overwrite)
+      .save(pathStr)
+
+    val utils = new ParquetUtils
+
+    val conf = new Configuration()
+    val path = new Path(pathStr)
+    val fs = path.getFileSystem(conf)
+
+    val parquetFilePath = fs.listStatus(path).filter(s => s.getPath.getName.endsWith(".parquet")).toSeq.head.getPath
+
+    val ranges = utils.readRangeFromParquetMetadata(conf, parquetFilePath,
+      Seq("c1", "c2", "c3a", "c3b", "c3c", "c4", "c5", "c6", "c7", "c8").asJava)
+
+    ranges.asScala
+      // NOTE: Unfortunately Parquet can't compute statistics for Timestamp columns, hence
+      //       c4 is skipped in the assertions below
+      .filterNot(r => r.getColumnName.equals("c4"))
+      .foreach(r => {
+        val min = r.getMinValue
+        val
max = r.getMaxValue
+
+        assertNotNull(min)
+        assertNotNull(max)
+        assertTrue(r.getMinValue.asInstanceOf[Comparable[Object]].compareTo(r.getMaxValue.asInstanceOf[Object]) <= 0)
+      })
+  }
 
   private def buildColumnStatsTableManually(tablePath: String, zorderedCols: Seq[String], indexSchema: StructType) = {
@@ -268,11 +318,84 @@
           df.selectExpr(exprs: _*)
             .collect()
-      }),
+      }).asJava,
       indexSchema
     )
   }
 
+  def bootstrapParquetInputTableFromJSON(sourceJSONTablePath: String, targetParquetTablePath: String): Unit = {
+    val jsonInputDF =
+      // NOTE: Schema here is provided for validation that the input data is in the appropriate format
+      spark.read
+        .schema(sourceTableSchema)
+        .json(sourceJSONTablePath)
+
+    jsonInputDF
+      .sort("c1")
+      .repartition(4, new Column("c1"))
+      .write
+      .format("parquet")
+      .mode("overwrite")
+      .save(targetParquetTablePath)
+
+    val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
+    // Have to clean up additional artifacts of the Spark write
+    fs.delete(new Path(targetParquetTablePath, "_SUCCESS"), false)
+  }
+
+  def replace(ds: Dataset[Row]): DataFrame = {
+    val uuidRegexp = "[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12}"
+
+    val uuids =
+      ds.selectExpr(s"regexp_extract(file, '(${uuidRegexp})')")
+        .distinct()
+        .collect()
+        .map(_.getString(0))
+
+    val maskUuid: UserDefinedFunction = functions.udf((fileName: String) => {
+      val uuid = uuids.find(uuid => fileName.contains(uuid)).get
+      fileName.replace(uuid, "xxx")
+    })
+
+    ds.withColumn("file", maskUuid(ds("file")))
+  }
+
+  private def generateRandomDataFrame(spark: SparkSession): DataFrame = {
+    val sourceTableSchema =
+      new StructType()
+        .add("c1", IntegerType)
+        .add("c2", StringType)
+        // NOTE: We're testing different values for precision of the decimal to make sure
+        //       we execute paths bearing different underlying representations in Parquet
+        // REF: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#DECIMAL
+        .add("c3a", DecimalType(9,3))
+        .add("c3b", DecimalType(10,3))
+        .add("c3c", DecimalType(20,3))
+        .add("c4", TimestampType)
+        .add("c5", ShortType)
+        .add("c6", DateType)
+        .add("c7", BinaryType)
+        .add("c8", ByteType)
+
+    val rdd = spark.sparkContext.parallelize(0 to 1000, 1).map { item =>
+      val c1 = Integer.valueOf(item)
+      val c2 = Random.nextString(10)
+      val c3a = java.math.BigDecimal.valueOf(Random.nextInt() % (1 << 24), 3)
+      val c3b = java.math.BigDecimal.valueOf(Random.nextLong() % (1L << 32), 3)
+      // NOTE: We cap it at 2^64 to make sure we're not exceeding target decimal's range
+      val c3c = new java.math.BigDecimal(new BigInteger(64, new java.util.Random()), 3)
+      val c4 = new Timestamp(System.currentTimeMillis())
+      val c5 = java.lang.Short.valueOf(s"${(item + 16) / 10}")
+      val c6 = Date.valueOf(s"${2020}-${item % 11 + 1}-${item % 28 + 1}")
+      val c7 = Array(item).map(_.toByte)
+      val c8 = java.lang.Byte.valueOf("9")
+
+      RowFactory.create(c1, c2, c3a, c3b, c3c, c4, c5, c6, c7, c8)
+    }
+
+    spark.createDataFrame(rdd, sourceTableSchema)
+  }
+
   private def asJson(df: DataFrame) =
     df.toJSON
       .select("value")
@@ -281,14 +404,12 @@
       .map(_.getString(0))
       .mkString("\n")
 
   private def sort(df: DataFrame): DataFrame = {
-    // Since upon parsing JSON, Spark re-order columns in lexicographical order
-    // of their names, we have to shuffle new Z-index table columns order to match
-    // Rows are sorted by filename as well to avoid
     val sortedCols = df.columns.sorted
+    // Sort
dataset by the first 2 columns (to minimize non-determinism in case multiple files have the same + // value of the first column) df.select(sortedCols.head, sortedCols.tail: _*) - .sort("file") + .sort("c1_maxValue", "c1_minValue") } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestSpaceCurveLayoutOptimization.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestLayoutOptimization.scala similarity index 81% rename from hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestSpaceCurveLayoutOptimization.scala rename to hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestLayoutOptimization.scala index e453953ff11e2..818addaf87399 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestSpaceCurveLayoutOptimization.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestLayoutOptimization.scala @@ -32,12 +32,10 @@ import org.junit.jupiter.params.ParameterizedTest import org.junit.jupiter.params.provider.Arguments.arguments import org.junit.jupiter.params.provider.{Arguments, MethodSource} -import java.sql.{Date, Timestamp} import scala.collection.JavaConversions._ -import scala.util.Random @Tag("functional") -class TestSpaceCurveLayoutOptimization extends HoodieClientTestBase { +class TestLayoutOptimization extends HoodieClientTestBase { var spark: SparkSession = _ val sourceTableSchema = @@ -79,7 +77,13 @@ class TestSpaceCurveLayoutOptimization extends HoodieClientTestBase { @ParameterizedTest @MethodSource(Array("testLayoutOptimizationParameters")) - def testLayoutOptimizationFunctional(tableType: String): Unit = { + def testLayoutOptimizationFunctional(tableType: String, + layoutOptimizationStrategy: String, + spatialCurveCompositionStrategy: String): Unit = { + val curveCompositionStrategy = + Option(spatialCurveCompositionStrategy) + .getOrElse(HoodieClusteringConfig.LAYOUT_OPTIMIZE_SPATIAL_CURVE_BUILD_METHOD.defaultValue()) + val targetRecordsCount = 10000 // Bulk Insert Operation val records = recordsToStrings(dataGen.generateInserts("001", targetRecordsCount)).toList @@ -98,8 +102,9 @@ class TestSpaceCurveLayoutOptimization extends HoodieClientTestBase { .option("hoodie.clustering.plan.strategy.small.file.limit", "629145600") .option("hoodie.clustering.plan.strategy.max.bytes.per.group", Long.MaxValue.toString) .option("hoodie.clustering.plan.strategy.target.file.max.bytes", String.valueOf(64 * 1024 * 1024L)) - .option(HoodieClusteringConfig.LAYOUT_OPTIMIZE_ENABLE.key, "true") - .option(HoodieClusteringConfig.PLAN_STRATEGY_SORT_COLUMNS.key, "begin_lat, begin_lon") + .option(HoodieClusteringConfig.LAYOUT_OPTIMIZE_STRATEGY.key(), layoutOptimizationStrategy) + .option(HoodieClusteringConfig.LAYOUT_OPTIMIZE_SPATIAL_CURVE_BUILD_METHOD.key(), curveCompositionStrategy) + .option(HoodieClusteringConfig.PLAN_STRATEGY_SORT_COLUMNS.key, "begin_lat,begin_lon") .mode(SaveMode.Overwrite) .save(basePath) @@ -144,32 +149,22 @@ class TestSpaceCurveLayoutOptimization extends HoodieClientTestBase { val rows = one.count() assert(rows == other.count() && one.intersect(other).count() == rows) } - - def createComplexDataFrame(spark: SparkSession): DataFrame = { - val rdd = spark.sparkContext.parallelize(0 to 1000, 1).map { item => - val c1 = Integer.valueOf(item) - val c2 = s" ${item}sdc" - val c3 = new java.math.BigDecimal(s"${Random.nextInt(1000)}.${item}") - val c4 = new Timestamp(System.currentTimeMillis()) - 
val c5 = java.lang.Short.valueOf(s"${(item + 16) /10}") - val c6 = Date.valueOf(s"${2020}-${item % 11 + 1}-${item % 28 + 1}") - val c7 = Array(item).map(_.toByte) - val c8 = java.lang.Byte.valueOf("9") - - RowFactory.create(c1, c2, c3, c4, c5, c6, c7, c8) - } - spark.createDataFrame(rdd, sourceTableSchema) - } } -object TestSpaceCurveLayoutOptimization { +object TestLayoutOptimization { def testLayoutOptimizationParameters(): java.util.stream.Stream[Arguments] = { java.util.stream.Stream.of( - arguments("COPY_ON_WRITE", "hilbert"), - arguments("COPY_ON_WRITE", "z-order"), - arguments("MERGE_ON_READ", "hilbert"), - arguments("MERGE_ON_READ", "z-order") + arguments("COPY_ON_WRITE", "linear", null), + arguments("COPY_ON_WRITE", "z-order", "direct"), + arguments("COPY_ON_WRITE", "z-order", "sample"), + arguments("COPY_ON_WRITE", "hilbert", "direct"), + arguments("COPY_ON_WRITE", "hilbert", "sample"), + + arguments("MERGE_ON_READ", "linear", null), + arguments("MERGE_ON_READ", "z-order", "direct"), + arguments("MERGE_ON_READ", "z-order", "sample"), + arguments("MERGE_ON_READ", "hilbert", "direct"), + arguments("MERGE_ON_READ", "hilbert", "sample") ) } } - diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSource.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSource.scala index f420b296e2b3a..ed6ef87b8e14f 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSource.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMORDataSource.scala @@ -17,19 +17,22 @@ package org.apache.hudi.functional +import org.apache.avro.generic.GenericRecord import org.apache.hadoop.fs.Path import org.apache.hudi.DataSourceWriteOptions._ import org.apache.hudi.common.config.HoodieMetadataConfig -import org.apache.hudi.common.model.{DefaultHoodieRecordPayload, HoodieTableType} +import org.apache.hudi.common.model.{DefaultHoodieRecordPayload, HoodieRecord, HoodieRecordPayload, HoodieTableType} import org.apache.hudi.common.table.HoodieTableMetaClient import org.apache.hudi.common.testutils.HoodieTestDataGenerator import org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings import org.apache.hudi.config.{HoodieIndexConfig, HoodieWriteConfig} import org.apache.hudi.index.HoodieIndex.IndexType import org.apache.hudi.keygen.NonpartitionedKeyGenerator +import org.apache.hudi.keygen.constant.KeyGeneratorOptions.Config import org.apache.hudi.testutils.{DataSourceTestUtils, HoodieClientTestBase} -import org.apache.hudi.{DataSourceReadOptions, DataSourceWriteOptions, HoodieDataSourceHelpers} +import org.apache.hudi.{AvroConversionUtils, DataSourceReadOptions, DataSourceWriteOptions, HoodieDataSourceHelpers, HoodieSparkUtils} import org.apache.log4j.LogManager +import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.BooleanType @@ -38,6 +41,7 @@ import org.junit.jupiter.api.{AfterEach, BeforeEach, Test} import org.junit.jupiter.params.ParameterizedTest import org.junit.jupiter.params.provider.CsvSource +import java.util import scala.collection.JavaConversions._ import scala.collection.JavaConverters._ @@ -345,11 +349,15 @@ class TestMORDataSource extends HoodieClientTestBase { // First Operation: // Producing parquet files to three default partitions. // SNAPSHOT view on MOR table with parquet files only. 
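    // (A MOR snapshot query merges base files with any pending log files; since this first
    // operation produces parquet base files only, the snapshot view at this point matches a
    // read-optimized view.)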
+
+    // Overriding the partition-path field
+    val opts = commonOpts + (DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition_path")
+
     val hoodieRecords1 = dataGen.generateInserts("001", 100)
-    val records1 = recordsToStrings(hoodieRecords1).toList
-    val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2))
+
+    val inputDF1 = toDataset(hoodieRecords1)
     inputDF1.write.format("org.apache.hudi")
-      .options(commonOpts)
+      .options(opts)
       .option("hoodie.compact.inline", "false") // else fails due to compaction & deltacommit instant times being same
       .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
      .option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
@@ -372,11 +380,10 @@
     // Second Operation:
     // Upsert 50 update records
     // Snapshot view should read 100 records
-    val records2 = recordsToStrings(dataGen.generateUniqueUpdates("002", 50))
-      .toList
-    val inputDF2: Dataset[Row] = spark.read.json(spark.sparkContext.parallelize(records2, 2))
+    val records2 = dataGen.generateUniqueUpdates("002", 50)
+    val inputDF2 = toDataset(records2)
     inputDF2.write.format("org.apache.hudi")
-      .options(commonOpts)
+      .options(opts)
       .mode(SaveMode.Append)
       .save(basePath)
     val hudiSnapshotDF2 = spark.read.format("org.apache.hudi")
@@ -420,17 +427,31 @@
     verifyShow(hudiIncDF2)
     verifyShow(hudiIncDF1Skipmerge)
 
-    val record3 = recordsToStrings(dataGen.generateUpdatesWithTS("003", hoodieRecords1, -1))
-    spark.read.json(spark.sparkContext.parallelize(record3, 2))
-      .write.format("org.apache.hudi").options(commonOpts)
+    val record3 = dataGen.generateUpdatesWithTS("003", hoodieRecords1, -1)
+    val inputDF3 = toDataset(record3)
+    inputDF3.write.format("org.apache.hudi").options(opts)
       .mode(SaveMode.Append).save(basePath)
+
     val hudiSnapshotDF3 = spark.read.format("org.apache.hudi")
       .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
       .load(basePath + "/*/*/*/*")
+
+    verifyShow(hudiSnapshotDF3)
+
     assertEquals(100, hudiSnapshotDF3.count())
     assertEquals(0, hudiSnapshotDF3.filter("rider = 'rider-003'").count())
   }
 
+  private def toDataset(records: util.List[HoodieRecord[_]]) = {
+    val avroRecords = records.map(_.getData
+      .asInstanceOf[HoodieRecordPayload[_]]
+      .getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA)
+      .get
+      .asInstanceOf[GenericRecord])
+    val rdd: RDD[GenericRecord] = spark.sparkContext.parallelize(avroRecords, 2)
+    AvroConversionUtils.createDataFrame(rdd, HoodieTestDataGenerator.AVRO_SCHEMA.toString, spark)
+  }
+
   @Test
   def testVectorizedReader() {
     spark.conf.set("spark.sql.parquet.enableVectorizedReader", true)
@@ -549,10 +570,10 @@
       .orderBy(desc("_hoodie_commit_time"))
       .head()
     assertEquals(sampleRow.getDouble(0), sampleRow.get(0))
-    assertEquals(sampleRow.getLong(1), sampleRow.get(1))
+    assertEquals(sampleRow.getDate(1), sampleRow.get(1))
     assertEquals(sampleRow.getString(2), sampleRow.get(2))
     assertEquals(sampleRow.getSeq(3), sampleRow.get(3))
-    assertEquals(sampleRow.getStruct(4), sampleRow.get(4))
+    assertEquals(sampleRow.getAs[Array[Byte]](4), sampleRow.get(4))
   }
 
   def verifyShow(df: DataFrame): Unit = {
@@ -770,4 +791,79 @@
       .load(basePath + "/*/*/*/*")
     assertEquals(numRecords - numRecordsToDelete, snapshotDF2.count())
   }
+
+  /**
+   * Tests querying with a partition condition on a Hudi table whose partition-field values
+   * differ from the physical partition paths, e.g. a table written with
+   * TimestampBasedKeyGenerator.
+   *
+   * For a MOR table, tests all three query modes.
+   */
+  @Test
+  def testPrunePartitionForTimestampBasedKeyGenerator(): Unit = {
+    val options = commonOpts ++ Map(
+      "hoodie.compact.inline" -> "false",
+      DataSourceWriteOptions.TABLE_TYPE.key -> DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL,
+      DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key -> "org.apache.hudi.keygen.TimestampBasedKeyGenerator",
+      Config.TIMESTAMP_TYPE_FIELD_PROP -> "DATE_STRING",
+      Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP -> "yyyy/MM/dd",
+      Config.TIMESTAMP_TIMEZONE_FORMAT_PROP -> "GMT+8:00",
+      Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP -> "yyyy-MM-dd"
+    )
+
+    val dataGen1 = new HoodieTestDataGenerator(Array("2022-01-01"))
+    val records1 = recordsToStrings(dataGen1.generateInserts("001", 50)).toList
+    val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2))
+    inputDF1.write.format("org.apache.hudi")
+      .options(options)
+      .mode(SaveMode.Overwrite)
+      .save(basePath)
+    metaClient = HoodieTableMetaClient.builder()
+      .setBasePath(basePath)
+      .setConf(spark.sessionState.newHadoopConf)
+      .build()
+    val commit1Time = metaClient.getActiveTimeline.lastInstant().get().getTimestamp
+
+    val dataGen2 = new HoodieTestDataGenerator(Array("2022-01-02"))
+    val records2 = recordsToStrings(dataGen2.generateInserts("002", 60)).toList
+    val inputDF2 = spark.read.json(spark.sparkContext.parallelize(records2, 2))
+    inputDF2.write.format("org.apache.hudi")
+      .options(options)
+      .mode(SaveMode.Append)
+      .save(basePath)
+    val commit2Time = metaClient.reloadActiveTimeline.lastInstant().get().getTimestamp
+
+    val records3 = recordsToStrings(dataGen2.generateUniqueUpdates("003", 20)).toList
+    val inputDF3 = spark.read.json(spark.sparkContext.parallelize(records3, 2))
+    inputDF3.write.format("org.apache.hudi")
+      .options(options)
+      .mode(SaveMode.Append)
+      .save(basePath)
+    val commit3Time = metaClient.reloadActiveTimeline.lastInstant().get().getTimestamp
+
+    // snapshot query
+    val snapshotQueryRes = spark.read.format("hudi").load(basePath)
+    assertEquals(snapshotQueryRes.where(s"_hoodie_commit_time = '$commit1Time'").count, 50)
+    assertEquals(snapshotQueryRes.where(s"_hoodie_commit_time = '$commit2Time'").count, 40)
+    assertEquals(snapshotQueryRes.where(s"_hoodie_commit_time = '$commit3Time'").count, 20)
+
+    assertEquals(snapshotQueryRes.where("partition = '2022-01-01'").count, 50)
+    assertEquals(snapshotQueryRes.where("partition = '2022-01-02'").count, 60)
+
+    // read_optimized query
+    val readOptimizedQueryRes = spark.read.format("hudi")
+      .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL)
+      .load(basePath)
+    assertEquals(readOptimizedQueryRes.where("partition = '2022-01-01'").count, 50)
+    assertEquals(readOptimizedQueryRes.where("partition = '2022-01-02'").count, 60)
+
+    // incremental query
+    val incrementalQueryRes = spark.read.format("hudi")
+      .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
+      .option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, commit2Time)
+      .option(DataSourceReadOptions.END_INSTANTTIME.key, commit3Time)
+      .load(basePath)
+    assertEquals(incrementalQueryRes.where("partition = '2022-01-01'").count, 0)
+    assertEquals(incrementalQueryRes.where("partition = '2022-01-02'").count, 20)
+  }
 }
diff
--git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMetadataTableWithSparkDataSource.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMetadataTableWithSparkDataSource.scala new file mode 100644 index 0000000000000..918202e974682 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestMetadataTableWithSparkDataSource.scala @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.functional + +import org.apache.hudi.DataSourceWriteOptions +import org.apache.hudi.common.config.HoodieMetadataConfig +import org.apache.hudi.common.testutils.HoodieTestDataGenerator +import org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings +import org.apache.hudi.config.HoodieWriteConfig +import org.apache.hudi.testutils.SparkClientFunctionalTestHarness +import org.apache.spark.sql.SaveMode +import org.junit.jupiter.api.Assertions.assertEquals +import org.junit.jupiter.api.{Tag, Test} + +import scala.collection.JavaConverters._ + +@Tag("functional") +class TestMetadataTableWithSparkDataSource extends SparkClientFunctionalTestHarness { + + val hudi = "org.apache.hudi" + var commonOpts = Map( + "hoodie.insert.shuffle.parallelism" -> "4", + "hoodie.upsert.shuffle.parallelism" -> "4", + "hoodie.bulkinsert.shuffle.parallelism" -> "2", + "hoodie.delete.shuffle.parallelism" -> "1", + DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key", + DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition", + DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "timestamp", + HoodieWriteConfig.TBL_NAME.key -> "hoodie_test" + ) + + @Test + def testReadability(): Unit = { + val dataGen = new HoodieTestDataGenerator() + + val opts: Map[String, String] = commonOpts ++ Map( + HoodieMetadataConfig.ENABLE.key -> "true", + HoodieMetadataConfig.COMPACT_NUM_DELTA_COMMITS.key -> "1" + ) + + // Insert records + val newRecords = dataGen.generateInserts("001", 100) + val newRecordsDF = parseRecords(recordsToStrings(newRecords).asScala) + + newRecordsDF.write.format(hudi) + .options(opts) + .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL) + .mode(SaveMode.Append) + .save(basePath) + + // Update records + val updatedRecords = dataGen.generateUpdates("002", newRecords) + val updatedRecordsDF = parseRecords(recordsToStrings(updatedRecords).asScala) + + updatedRecordsDF.write.format(hudi) + .options(opts) + .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL) + .mode(SaveMode.Append) + .save(basePath) + + val metadataDF = spark.read.format(hudi).load(s"$basePath/.hoodie/metadata") + + // Smoke test + metadataDF.show() + + // Query w/ 0 
requested columns should be working fine + assertEquals(4, metadataDF.count()) + + val expectedKeys = Seq("2015/03/16", "2015/03/17", "2016/03/15", "__all_partitions__") + val keys = metadataDF.select("key") + .collect() + .map(_.getString(0)) + .toSeq + .sorted + + assertEquals(expectedKeys, keys) + } + + private def parseRecords(records: Seq[String]) = { + spark.read.json(spark.sparkContext.parallelize(records, 2)) + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTable.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTable.scala index 469b135959846..0f2cb547c2fe9 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTable.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTable.scala @@ -91,9 +91,10 @@ class TestAlterTable extends TestHoodieSqlBase { ) // change column's data type - spark.sql(s"alter table $newTableName change column id id bigint") - assertResult(StructType(Seq(StructField("id", LongType, nullable = true))))( - spark.sql(s"select id from $newTableName").schema) + checkExceptionContain(s"alter table $newTableName change column id id bigint") ( + "ALTER TABLE CHANGE COLUMN is not supported for changing column 'id'" + + " with type 'IntegerType' to 'id' with type 'LongType'" + ) // Insert data to the new table. spark.sql(s"insert into $newTableName values(2, 'a2', 12, 1000, 'e0')") diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCallCommandParser.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCallCommandParser.scala new file mode 100644 index 0000000000000..9d1c02ad99faa --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCallCommandParser.scala @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi + +import com.google.common.collect.ImmutableList +import org.apache.spark.sql.catalyst.expressions.Literal +import org.apache.spark.sql.catalyst.plans.logical.{CallCommand, NamedArgument, PositionalArgument} +import org.apache.spark.sql.types.{DataType, DataTypes} + +import java.math.BigDecimal +import scala.collection.JavaConverters + +class TestCallCommandParser extends TestHoodieSqlBase { + private val parser = spark.sessionState.sqlParser + + test("Test Call Produce with Positional Arguments") { + val call = parser.parsePlan("CALL c.n.func(1, '2', 3L, true, 1.0D, 9.0e1, 900e-1BD)").asInstanceOf[CallCommand] + assertResult(ImmutableList.of("c", "n", "func"))(JavaConverters.seqAsJavaListConverter(call.name).asJava) + + assertResult(7)(call.args.size) + + checkArg(call, 0, 1, DataTypes.IntegerType) + checkArg(call, 1, "2", DataTypes.StringType) + checkArg(call, 2, 3L, DataTypes.LongType) + checkArg(call, 3, true, DataTypes.BooleanType) + checkArg(call, 4, 1.0D, DataTypes.DoubleType) + checkArg(call, 5, new BigDecimal("9.0e1"), DataTypes.createDecimalType(2, 0)) + checkArg(call, 6, new BigDecimal("900e-1"), DataTypes.createDecimalType(3, 1)) + } + + test("Test Call Produce with Named Arguments") { + val call = parser.parsePlan("CALL system.func(c1 => 1, c2 => '2', c3 => true)").asInstanceOf[CallCommand] + assertResult(ImmutableList.of("system", "func"))(JavaConverters.seqAsJavaListConverter(call.name).asJava) + + assertResult(3)(call.args.size) + + checkArg(call, 0, "c1", 1, DataTypes.IntegerType) + checkArg(call, 1, "c2", "2", DataTypes.StringType) + checkArg(call, 2, "c3", true, DataTypes.BooleanType) + } + + test("Test Call Produce with Var Substitution") { + val call = parser.parsePlan("CALL system.func('${spark.extra.prop}')").asInstanceOf[CallCommand] + assertResult(ImmutableList.of("system", "func"))(JavaConverters.seqAsJavaListConverter(call.name).asJava) + + assertResult(1)(call.args.size) + + checkArg(call, 0, "value", DataTypes.StringType) + } + + test("Test Call Produce with Mixed Arguments") { + val call = parser.parsePlan("CALL system.func(c1 => 1, '2')").asInstanceOf[CallCommand] + assertResult(ImmutableList.of("system", "func"))(JavaConverters.seqAsJavaListConverter(call.name).asJava) + + assertResult(2)(call.args.size) + + checkArg(call, 0, "c1", 1, DataTypes.IntegerType) + checkArg(call, 1, "2", DataTypes.StringType) + } + + test("Test Call Parse Error") { + checkParseExceptionContain("CALL cat.system radish kebab")("mismatched input 'CALL' expecting") + } + + protected def checkParseExceptionContain(sql: String)(errorMsg: String): Unit = { + var hasException = false + try { + parser.parsePlan(sql) + } catch { + case e: Throwable => + assertResult(true)(e.getMessage.contains(errorMsg)) + hasException = true + } + assertResult(true)(hasException) + } + + private def checkArg(call: CallCommand, index: Int, expectedValue: Any, expectedType: DataType): Unit = { + checkArg(call, index, null, expectedValue, expectedType) + } + + private def checkArg(call: CallCommand, index: Int, expectedName: String, expectedValue: Any, expectedType: DataType): Unit = { + if (expectedName != null) { + val arg = checkCast(call.args.apply(index), classOf[NamedArgument]) + assertResult(expectedName)(arg.name) + } + else { + val arg = call.args.apply(index) + checkCast(arg, classOf[PositionalArgument]) + } + val expectedExpr = toSparkLiteral(expectedValue, expectedType) + val actualExpr = call.args.apply(index).expr + 
assertResult(expectedExpr.dataType)(actualExpr.dataType) + } + + private def toSparkLiteral(value: Any, dataType: DataType) = Literal.apply(value, dataType) + + private def checkCast[T](value: Any, expectedClass: Class[T]) = { + assertResult(true)(expectedClass.isInstance(value)) + expectedClass.cast(value) + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCallProcedure.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCallProcedure.scala new file mode 100644 index 0000000000000..eb2c614df201b --- /dev/null +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestCallProcedure.scala @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi + +class TestCallProcedure extends TestHoodieSqlBase { + + test("Test Call show_commits Procedure") { + withTempDir { tmp => + val tableName = generateTableName + // create table + spark.sql( + s""" + |create table $tableName ( + | id int, + | name string, + | price double, + | ts long + |) using hudi + | location '${tmp.getCanonicalPath}/$tableName' + | tblproperties ( + | primaryKey = 'id', + | preCombineField = 'ts' + | ) + """.stripMargin) + // insert data to table + spark.sql(s"insert into $tableName select 1, 'a1', 10, 1000") + spark.sql(s"insert into $tableName select 2, 'a2', 20, 1500") + + // Check required fields + checkExceptionContain(s"""call show_commits(limit => 10)""")( + s"Argument: table is required") + + // collect commits for table + val commits = spark.sql(s"""call show_commits(table => '$tableName', limit => 10)""").collect() + assertResult(2) { + commits.length + } + } + } + + test("Test Call show_commits_metadata Procedure") { + withTempDir { tmp => + val tableName = generateTableName + // create table + spark.sql( + s""" + |create table $tableName ( + | id int, + | name string, + | price double, + | ts long + |) using hudi + | location '${tmp.getCanonicalPath}/$tableName' + | tblproperties ( + | primaryKey = 'id', + | preCombineField = 'ts' + | ) + """.stripMargin) + // insert data to table + spark.sql(s"insert into $tableName select 1, 'a1', 10, 1000") + + // Check required fields + checkExceptionContain(s"""call show_commits_metadata(limit => 10)""")( + s"Argument: table is required") + + // collect commits for table + val commits = spark.sql(s"""call show_commits_metadata(table => '$tableName', limit => 10)""").collect() + assertResult(1) { + commits.length + } + } + } + + test("Test Call rollback_to_instant Procedure") { + withTempDir { tmp => + val tableName = generateTableName + // create table + spark.sql( + s""" + |create table $tableName ( + | id int, + | name string, + | price double, + | ts long + |) using hudi + | 
location '${tmp.getCanonicalPath}/$tableName' + | tblproperties ( + | primaryKey = 'id', + | preCombineField = 'ts' + | ) + """.stripMargin) + // insert data to table + spark.sql(s"insert into $tableName select 1, 'a1', 10, 1000") + spark.sql(s"insert into $tableName select 2, 'a2', 20, 1500") + spark.sql(s"insert into $tableName select 3, 'a3', 30, 2000") + + // Check required fields + checkExceptionContain(s"""call rollback_to_instant(table => '$tableName')""")( + s"Argument: instant_time is required") + + // 3 commits are left before rollback + var commits = spark.sql(s"""call show_commits(table => '$tableName', limit => 10)""").collect() + assertResult(3){commits.length} + + // Call rollback_to_instant Procedure with Named Arguments + var instant_time = commits(0).get(0).toString + checkAnswer(s"""call rollback_to_instant(table => '$tableName', instant_time => '$instant_time')""")(Seq(true)) + // Call rollback_to_instant Procedure with Positional Arguments + instant_time = commits(1).get(0).toString + checkAnswer(s"""call rollback_to_instant('$tableName', '$instant_time')""")(Seq(true)) + + // 1 commits are left after rollback + commits = spark.sql(s"""call show_commits(table => '$tableName', limit => 10)""").collect() + assertResult(1){commits.length} + } + } +} diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestHoodieSqlBase.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestHoodieSqlBase.scala index a5b49cc3683d0..ca3919599b6fa 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestHoodieSqlBase.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestHoodieSqlBase.scala @@ -18,8 +18,10 @@ package org.apache.spark.sql.hudi import org.apache.hadoop.fs.Path +import org.apache.hudi.HoodieSparkUtils import org.apache.hudi.common.fs.FSUtils import org.apache.log4j.Level +import org.apache.spark.SparkConf import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.util.Utils @@ -49,10 +51,20 @@ class TestHoodieSqlBase extends FunSuite with BeforeAndAfterAll { .config("hoodie.delete.shuffle.parallelism", "4") .config("spark.sql.warehouse.dir", sparkWareHouse.getCanonicalPath) .config("spark.sql.session.timeZone", "CTT") + .config(sparkConf()) .getOrCreate() private var tableId = 0 + def sparkConf(): SparkConf = { + val sparkConf = new SparkConf() + if (HoodieSparkUtils.gteqSpark3_2) { + sparkConf.set("spark.sql.catalog.spark_catalog", + "org.apache.spark.sql.hudi.catalog.HoodieCatalog") + } + sparkConf + } + protected def withTempDir(f: File => Unit): Unit = { val tempDir = Utils.createTempDir() try f(tempDir) finally { diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestInsertTable.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestInsertTable.scala index 4d12d987ff3eb..b186381c25203 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestInsertTable.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestInsertTable.scala @@ -17,8 +17,12 @@ package org.apache.spark.sql.hudi -import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.DataSourceWriteOptions.{KEYGENERATOR_CLASS_NAME, MOR_TABLE_TYPE_OPT_VAL, PARTITIONPATH_FIELD, PRECOMBINE_FIELD, RECORDKEY_FIELD, TABLE_TYPE} +import 
org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} +import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.exception.HoodieDuplicateKeyException +import org.apache.hudi.keygen.ComplexKeyGenerator +import org.apache.spark.sql.SaveMode import java.io.File @@ -582,8 +586,48 @@ class TestInsertTable extends TestHoodieSqlBase { checkAnswer(s"select id, name, price, ts from $tableName")( Seq(1, "a1", 11.0, 1000) ) - } } + test("Test For read operation's field") { + withTempDir { tmp => { + val tableName = generateTableName + val tablePath = s"${tmp.getCanonicalPath}/$tableName" + import spark.implicits._ + val day = "2021-08-02" + val df = Seq((1, "a1", 10, 1000, day, 12)).toDF("id", "name", "value", "ts", "day", "hh") + // Write a table by spark dataframe. + df.write.format("hudi") + .option(HoodieWriteConfig.TBL_NAME.key, tableName) + .option(TABLE_TYPE.key, MOR_TABLE_TYPE_OPT_VAL) + .option(RECORDKEY_FIELD.key, "id") + .option(PRECOMBINE_FIELD.key, "ts") + .option(PARTITIONPATH_FIELD.key, "day,hh") + .option(KEYGENERATOR_CLASS_NAME.key, classOf[ComplexKeyGenerator].getName) + .option(HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key, "1") + .option(HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key, "1") + .option(HoodieWriteConfig.ALLOW_OPERATION_METADATA_FIELD.key, "true") + .mode(SaveMode.Overwrite) + .save(tablePath) + + val metaClient = HoodieTableMetaClient.builder() + .setBasePath(tablePath) + .setConf(spark.sessionState.newHadoopConf()) + .build() + + assertResult(true)(new TableSchemaResolver(metaClient).isHasOperationField) + + spark.sql( + s""" + |create table $tableName using hudi + |location '${tablePath}' + |""".stripMargin) + + // Note: spark sql batch write currently does not write actual content to the operation field + checkAnswer(s"select id, _hoodie_operation from $tableName")( + Seq(1, null) + ) + } + } + } } diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestMergeIntoTable.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestMergeIntoTable.scala index baac82f4bd153..28dee88e1f61e 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestMergeIntoTable.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestMergeIntoTable.scala @@ -87,7 +87,7 @@ class TestMergeIntoTable extends TestHoodieSqlBase { | on s0.id = $tableName.id | when matched then update set | id = s0.id, name = s0.name, price = s0.price + $tableName.price, ts = s0.ts - | when not matched and id % 2 = 0 then insert * + | when not matched and s0.id % 2 = 0 then insert * """.stripMargin) checkAnswer(s"select id, name, price, ts from $tableName")( Seq(1, "a1", 30.0, 1002), @@ -102,9 +102,9 @@ class TestMergeIntoTable extends TestHoodieSqlBase { | select 1 as id, 'a1' as name, 12 as price, 1003 as ts | ) s0 | on s0.id = $tableName.id - | when matched and id != 1 then update set + | when matched and s0.id != 1 then update set | id = s0.id, name = s0.name, price = s0.price, ts = s0.ts - | when matched and id = 1 then delete + | when matched and s0.id = 1 then delete | when not matched then insert * """.stripMargin) val cnt = spark.sql(s"select * from $tableName where id = 1").count() @@ -178,7 +178,7 @@ class TestMergeIntoTable extends TestHoodieSqlBase { | ) | ) s0 | on s0.s_id = t0.id - | when matched and ts = 1001 then update set id = s0.s_id, name = t0.name, price = + | when matched and s0.ts = 1001 then update set id = s0.s_id, name = 
t0.name, price = | s0.price, ts = s0.ts """.stripMargin ) @@ -233,7 +233,7 @@ class TestMergeIntoTable extends TestHoodieSqlBase { | select 1 as id, 'a1' as name, 12 as price, 1001 as ts, '2021-03-21' as dt | ) as s0 | on t0.id = s0.id - | when matched and id % 2 = 0 then update set * + | when matched and s0.id % 2 = 0 then update set * """.stripMargin ) checkAnswer(s"select id,name,price,dt from $tableName")( @@ -488,7 +488,7 @@ class TestMergeIntoTable extends TestHoodieSqlBase { |merge into $targetTable t0 |using $sourceTable s0 |on t0.id = s0.id - |when matched and cast(_ts as string) > '1000' then update set * + |when matched and cast(s0._ts as string) > '1000' then update set * """.stripMargin) checkAnswer(s"select id, name, price, _ts from $targetTable")( Seq(1, "a1", 12, 1001) @@ -512,7 +512,7 @@ class TestMergeIntoTable extends TestHoodieSqlBase { |using $sourceTable s0 |on t0.id = s0.id |when matched then update set * - |when not matched and name = 'a2' then insert * + |when not matched and s0.name = 'a2' then insert * """.stripMargin) checkAnswer(s"select id, name, price, _ts from $targetTable order by id")( Seq(1, "a1", 12, 1001), diff --git a/hudi-spark-datasource/hudi-spark2/pom.xml b/hudi-spark-datasource/hudi-spark2/pom.xml index 2283603542ee8..3fb6cf3dd65ba 100644 --- a/hudi-spark-datasource/hudi-spark2/pom.xml +++ b/hudi-spark-datasource/hudi-spark2/pom.xml @@ -199,6 +199,15 @@ org.apache.spark spark-sql_${scala.binary.version} ${spark2.version} + provided + true + + + + org.apache.spark + spark-avro_${scala.binary.version} + ${spark2.version} + provided true diff --git a/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/adapter/Spark2Adapter.scala b/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/adapter/Spark2Adapter.scala index bf1cd24484c1a..5dfa7d9574d9a 100644 --- a/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/adapter/Spark2Adapter.scala +++ b/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/adapter/Spark2Adapter.scala @@ -17,8 +17,10 @@ package org.apache.spark.sql.adapter +import org.apache.avro.Schema import org.apache.hudi.Spark2RowSerDe import org.apache.hudi.client.utils.SparkRowSerDe +import org.apache.spark.sql.avro.{HoodieAvroDeserializerTrait, HoodieAvroSerializerTrait, Spark2HoodieAvroDeserializer, HoodieAvroSerializer} import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.{Expression, Like} @@ -26,17 +28,26 @@ import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.catalyst.plans.JoinType import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, Join, LogicalPlan} import org.apache.spark.sql.catalyst.{AliasIdentifier, TableIdentifier} -import org.apache.spark.sql.execution.datasources.{Spark2ParsePartitionUtil, SparkParsePartitionUtil} +import org.apache.spark.sql.execution.datasources.{FilePartition, PartitionedFile, Spark2ParsePartitionUtil, SparkParsePartitionUtil} import org.apache.spark.sql.hudi.SparkAdapter import org.apache.spark.sql.hudi.parser.HoodieSpark2ExtendedSqlParser import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.DataType import org.apache.spark.sql.{Row, SparkSession} +import scala.collection.mutable.ArrayBuffer + /** * The adapter for spark2. 
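+ * It now also constructs Hoodie's Avro serializer/deserializer for this Spark version and
+ * back-ports FilePartition#getFilePartitions from Spark 3.2, which Spark 2 does not provide.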
*/ class Spark2Adapter extends SparkAdapter { + def createAvroSerializer(rootCatalystType: DataType, rootAvroType: Schema, nullable: Boolean): HoodieAvroSerializerTrait = + new HoodieAvroSerializer(rootCatalystType, rootAvroType, nullable) + + def createAvroDeserializer(rootAvroType: Schema, rootCatalystType: DataType): HoodieAvroDeserializerTrait = + new Spark2HoodieAvroDeserializer(rootAvroType, rootCatalystType) + override def createSparkRowSerDe(encoder: ExpressionEncoder[Row]): SparkRowSerDe = { new Spark2RowSerDe(encoder) } @@ -86,4 +97,44 @@ class Spark2Adapter extends SparkAdapter { override def parseMultipartIdentifier(parser: ParserInterface, sqlText: String): Seq[String] = { throw new IllegalStateException(s"Should not call ParserInterface#parseMultipartIdentifier for spark2") } + + /** + * Combine [[PartitionedFile]] to [[FilePartition]] according to `maxSplitBytes`. + * + * This is a copy of org.apache.spark.sql.execution.datasources.FilePartition#getFilePartitions from Spark 3.2. + * And this will be called only in Spark 2. + */ + override def getFilePartitions( + sparkSession: SparkSession, + partitionedFiles: Seq[PartitionedFile], + maxSplitBytes: Long): Seq[FilePartition] = { + + val partitions = new ArrayBuffer[FilePartition] + val currentFiles = new ArrayBuffer[PartitionedFile] + var currentSize = 0L + + /** Close the current partition and move to the next. */ + def closePartition(): Unit = { + if (currentFiles.nonEmpty) { + // Copy to a new Array. + val newPartition = FilePartition(partitions.size, currentFiles.toArray) + partitions += newPartition + } + currentFiles.clear() + currentSize = 0 + } + + val openCostInBytes = sparkSession.sessionState.conf.filesOpenCostInBytes + // Assign files to partitions using "Next Fit Decreasing" + partitionedFiles.foreach { file => + if (currentSize + file.length > maxSplitBytes) { + closePartition() + } + // Add the given file to the current partition. + currentSize += file.length + openCostInBytes + currentFiles += file + } + closePartition() + partitions.toSeq + } } diff --git a/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/avro/PatchedAvroDeserializer.scala b/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/avro/PatchedAvroDeserializer.scala new file mode 100644 index 0000000000000..8d9948c58cdd8 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/avro/PatchedAvroDeserializer.scala @@ -0,0 +1,398 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.avro + +import org.apache.avro.Conversions.DecimalConversion +import org.apache.avro.LogicalTypes.{TimestampMicros, TimestampMillis} +import org.apache.avro.Schema.Type._ +import org.apache.avro.generic._ +import org.apache.avro.util.Utf8 +import org.apache.avro.{LogicalTypes, Schema, SchemaBuilder} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{SpecificInternalRow, UnsafeArrayData} +import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, DateTimeUtils, GenericArrayData} +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String + +import java.math.BigDecimal +import java.nio.ByteBuffer +import scala.collection.JavaConverters._ +import scala.collection.mutable.ArrayBuffer + +/** + * A deserializer to deserialize data in avro format to data in catalyst format. + * + * NOTE: This is a version of {@code AvroDeserializer} impl from Spark 2.4.4 w/ the fix for SPARK-30267 + * applied on top of it + */ +class PatchedAvroDeserializer(rootAvroType: Schema, rootCatalystType: DataType) { + private lazy val decimalConversions = new DecimalConversion() + + private val converter: Any => Any = rootCatalystType match { + // A shortcut for empty schema. + case st: StructType if st.isEmpty => + (data: Any) => InternalRow.empty + + case st: StructType => + val resultRow = new SpecificInternalRow(st.map(_.dataType)) + val fieldUpdater = new RowUpdater(resultRow) + val writer = getRecordWriter(rootAvroType, st, Nil) + (data: Any) => { + val record = data.asInstanceOf[GenericRecord] + writer(fieldUpdater, record) + resultRow + } + + case _ => + val tmpRow = new SpecificInternalRow(Seq(rootCatalystType)) + val fieldUpdater = new RowUpdater(tmpRow) + val writer = newWriter(rootAvroType, rootCatalystType, Nil) + (data: Any) => { + writer(fieldUpdater, 0, data) + tmpRow.get(0, rootCatalystType) + } + } + + def deserialize(data: Any): Any = converter(data) + + /** + * Creates a writer to write avro values to Catalyst values at the given ordinal with the given + * updater. + */ + private def newWriter( + avroType: Schema, + catalystType: DataType, + path: List[String]): (CatalystDataUpdater, Int, Any) => Unit = + (avroType.getType, catalystType) match { + case (NULL, NullType) => (updater, ordinal, _) => + updater.setNullAt(ordinal) + + // TODO: we can avoid boxing if future version of avro provide primitive accessors. + case (BOOLEAN, BooleanType) => (updater, ordinal, value) => + updater.setBoolean(ordinal, value.asInstanceOf[Boolean]) + + case (INT, IntegerType) => (updater, ordinal, value) => + updater.setInt(ordinal, value.asInstanceOf[Int]) + + case (INT, DateType) => (updater, ordinal, value) => + updater.setInt(ordinal, value.asInstanceOf[Int]) + + case (LONG, LongType) => (updater, ordinal, value) => + updater.setLong(ordinal, value.asInstanceOf[Long]) + + case (LONG, TimestampType) => avroType.getLogicalType match { + case _: TimestampMillis => (updater, ordinal, value) => + updater.setLong(ordinal, value.asInstanceOf[Long] * 1000) + case _: TimestampMicros => (updater, ordinal, value) => + updater.setLong(ordinal, value.asInstanceOf[Long]) + case null => (updater, ordinal, value) => + // For backward compatibility, if the Avro type is Long and it is not logical type, + // the value is processed as timestamp type with millisecond precision. 
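+        // (Illustration: Catalyst's TimestampType stores microseconds, so a logical-type-less
+        // long like 1640995200000, the millis for 2022-01-01T00:00:00Z, is scaled below to
+        // the microsecond value 1640995200000000.)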
+ updater.setLong(ordinal, value.asInstanceOf[Long] * 1000) + case other => throw new IncompatibleSchemaException( + s"Cannot convert Avro logical type ${other} to Catalyst Timestamp type.") + } + + // Before we upgrade Avro to 1.8 for logical type support, spark-avro converts Long to Date. + // For backward compatibility, we still keep this conversion. + case (LONG, DateType) => (updater, ordinal, value) => + updater.setInt(ordinal, (value.asInstanceOf[Long] / DateTimeUtils.MILLIS_PER_DAY).toInt) + + case (FLOAT, FloatType) => (updater, ordinal, value) => + updater.setFloat(ordinal, value.asInstanceOf[Float]) + + case (DOUBLE, DoubleType) => (updater, ordinal, value) => + updater.setDouble(ordinal, value.asInstanceOf[Double]) + + case (STRING, StringType) => (updater, ordinal, value) => + val str = value match { + case s: String => UTF8String.fromString(s) + case s: Utf8 => + val bytes = new Array[Byte](s.getByteLength) + System.arraycopy(s.getBytes, 0, bytes, 0, s.getByteLength) + UTF8String.fromBytes(bytes) + } + updater.set(ordinal, str) + + case (ENUM, StringType) => (updater, ordinal, value) => + updater.set(ordinal, UTF8String.fromString(value.toString)) + + case (FIXED, BinaryType) => (updater, ordinal, value) => + updater.set(ordinal, value.asInstanceOf[GenericFixed].bytes().clone()) + + case (BYTES, BinaryType) => (updater, ordinal, value) => + val bytes = value match { + case b: ByteBuffer => + val bytes = new Array[Byte](b.remaining) + b.get(bytes) + bytes + case b: Array[Byte] => b + case other => throw new RuntimeException(s"$other is not a valid avro binary.") + } + updater.set(ordinal, bytes) + + case (FIXED, d: DecimalType) => (updater, ordinal, value) => + val bigDecimal = decimalConversions.fromFixed(value.asInstanceOf[GenericFixed], avroType, + LogicalTypes.decimal(d.precision, d.scale)) + val decimal = createDecimal(bigDecimal, d.precision, d.scale) + updater.setDecimal(ordinal, decimal) + + case (BYTES, d: DecimalType) => (updater, ordinal, value) => + val bigDecimal = decimalConversions.fromBytes(value.asInstanceOf[ByteBuffer], avroType, + LogicalTypes.decimal(d.precision, d.scale)) + val decimal = createDecimal(bigDecimal, d.precision, d.scale) + updater.setDecimal(ordinal, decimal) + + case (RECORD, st: StructType) => + val writeRecord = getRecordWriter(avroType, st, path) + (updater, ordinal, value) => + val row = new SpecificInternalRow(st) + writeRecord(new RowUpdater(row), value.asInstanceOf[GenericRecord]) + updater.set(ordinal, row) + + case (ARRAY, ArrayType(elementType, containsNull)) => + val elementWriter = newWriter(avroType.getElementType, elementType, path) + val elementPath = path :+ "element" + (updater, ordinal, value) => + val collection = value.asInstanceOf[java.util.Collection[Any]] + val len = collection.size() + val result = createArrayData(elementType, len) + val elementUpdater = new ArrayDataUpdater(result) + + var i = 0 + val iter = collection.iterator() + while (iter.hasNext) { + val element = iter.next() + if (element == null) { + if (!containsNull) { + throw new RuntimeException( + s"Array value at path '${elementPath.mkString(".")}' is not allowed to be null") + } else { + elementUpdater.setNullAt(i) + } + } else { + elementWriter(elementUpdater, i, element) + } + i += 1 + } + + updater.set(ordinal, result) + + case (MAP, MapType(keyType, valueType, valueContainsNull)) if keyType == StringType => + val keyWriter = newWriter(SchemaBuilder.builder().stringType(), StringType, path) + val valueWriter = newWriter(avroType.getValueType, 
valueType, path) + (updater, ordinal, value) => + val map = value.asInstanceOf[java.util.Map[AnyRef, AnyRef]] + val keyArray = createArrayData(keyType, map.size()) + val keyUpdater = new ArrayDataUpdater(keyArray) + val valueArray = createArrayData(valueType, map.size()) + val valueUpdater = new ArrayDataUpdater(valueArray) + val iter = map.entrySet().iterator() + var i = 0 + while (iter.hasNext) { + val entry = iter.next() + assert(entry.getKey != null) + keyWriter(keyUpdater, i, entry.getKey) + if (entry.getValue == null) { + if (!valueContainsNull) { + throw new RuntimeException(s"Map value at path ${path.mkString(".")} is not " + + "allowed to be null") + } else { + valueUpdater.setNullAt(i) + } + } else { + valueWriter(valueUpdater, i, entry.getValue) + } + i += 1 + } + + updater.set(ordinal, new ArrayBasedMapData(keyArray, valueArray)) + + case (UNION, _) => + val allTypes = avroType.getTypes.asScala + val nonNullTypes = allTypes.filter(_.getType != NULL) + val nonNullAvroType = Schema.createUnion(nonNullTypes.asJava) + if (nonNullTypes.nonEmpty) { + if (nonNullTypes.length == 1) { + newWriter(nonNullTypes.head, catalystType, path) + } else { + nonNullTypes.map(_.getType) match { + case Seq(a, b) if Set(a, b) == Set(INT, LONG) && catalystType == LongType => + (updater, ordinal, value) => value match { + case null => updater.setNullAt(ordinal) + case l: java.lang.Long => updater.setLong(ordinal, l) + case i: java.lang.Integer => updater.setLong(ordinal, i.longValue()) + } + + case Seq(a, b) if Set(a, b) == Set(FLOAT, DOUBLE) && catalystType == DoubleType => + (updater, ordinal, value) => value match { + case null => updater.setNullAt(ordinal) + case d: java.lang.Double => updater.setDouble(ordinal, d) + case f: java.lang.Float => updater.setDouble(ordinal, f.doubleValue()) + } + + case _ => + catalystType match { + case st: StructType if st.length == nonNullTypes.size => + val fieldWriters = nonNullTypes.zip(st.fields).map { + case (schema, field) => newWriter(schema, field.dataType, path :+ field.name) + }.toArray + (updater, ordinal, value) => { + val row = new SpecificInternalRow(st) + val fieldUpdater = new RowUpdater(row) + val i = GenericData.get().resolveUnion(nonNullAvroType, value) + fieldWriters(i)(fieldUpdater, i, value) + updater.set(ordinal, row) + } + + case _ => + throw new IncompatibleSchemaException( + s"Cannot convert Avro to catalyst because schema at path " + + s"${path.mkString(".")} is not compatible " + + s"(avroType = $avroType, sqlType = $catalystType).\n" + + s"Source Avro schema: $rootAvroType.\n" + + s"Target Catalyst type: $rootCatalystType") + } + } + } + } else { + (updater, ordinal, value) => updater.setNullAt(ordinal) + } + + case _ => + throw new IncompatibleSchemaException( + s"Cannot convert Avro to catalyst because schema at path ${path.mkString(".")} " + + s"is not compatible (avroType = $avroType, sqlType = $catalystType).\n" + + s"Source Avro schema: $rootAvroType.\n" + + s"Target Catalyst type: $rootCatalystType") + } + + // TODO: move the following method in Decimal object on creating Decimal from BigDecimal? + private def createDecimal(decimal: BigDecimal, precision: Int, scale: Int): Decimal = { + if (precision <= Decimal.MAX_LONG_DIGITS) { + // Constructs a `Decimal` with an unscaled `Long` value if possible. + Decimal(decimal.unscaledValue().longValue(), precision, scale) + } else { + // Otherwise, resorts to an unscaled `BigInteger` instead. 
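+      // (Illustration: 123.45 at precision 5, scale 2 has unscaled value 12345 and takes the
+      // Long path above; anything wider than Decimal.MAX_LONG_DIGITS, i.e. 18 digits, lands here.)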
+ Decimal(decimal, precision, scale) + } + } + + private def getRecordWriter( + avroType: Schema, + sqlType: StructType, + path: List[String]): (CatalystDataUpdater, GenericRecord) => Unit = { + val validFieldIndexes = ArrayBuffer.empty[Int] + val fieldWriters = ArrayBuffer.empty[(CatalystDataUpdater, Any) => Unit] + + val length = sqlType.length + var i = 0 + while (i < length) { + val sqlField = sqlType.fields(i) + val avroField = avroType.getField(sqlField.name) + if (avroField != null) { + validFieldIndexes += avroField.pos() + + val baseWriter = newWriter(avroField.schema(), sqlField.dataType, path :+ sqlField.name) + val ordinal = i + val fieldWriter = (fieldUpdater: CatalystDataUpdater, value: Any) => { + if (value == null) { + fieldUpdater.setNullAt(ordinal) + } else { + baseWriter(fieldUpdater, ordinal, value) + } + } + fieldWriters += fieldWriter + } else if (!sqlField.nullable) { + throw new IncompatibleSchemaException( + s""" + |Cannot find non-nullable field ${path.mkString(".")}.${sqlField.name} in Avro schema. + |Source Avro schema: $rootAvroType. + |Target Catalyst type: $rootCatalystType. + """.stripMargin) + } + i += 1 + } + + (fieldUpdater, record) => { + var i = 0 + while (i < validFieldIndexes.length) { + fieldWriters(i)(fieldUpdater, record.get(validFieldIndexes(i))) + i += 1 + } + } + } + + private def createArrayData(elementType: DataType, length: Int): ArrayData = elementType match { + case BooleanType => UnsafeArrayData.fromPrimitiveArray(new Array[Boolean](length)) + case ByteType => UnsafeArrayData.fromPrimitiveArray(new Array[Byte](length)) + case ShortType => UnsafeArrayData.fromPrimitiveArray(new Array[Short](length)) + case IntegerType => UnsafeArrayData.fromPrimitiveArray(new Array[Int](length)) + case LongType => UnsafeArrayData.fromPrimitiveArray(new Array[Long](length)) + case FloatType => UnsafeArrayData.fromPrimitiveArray(new Array[Float](length)) + case DoubleType => UnsafeArrayData.fromPrimitiveArray(new Array[Double](length)) + case _ => new GenericArrayData(new Array[Any](length)) + } + + /** + * A base interface for updating values inside catalyst data structure like `InternalRow` and + * `ArrayData`. 
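+ * This lets the same writer closures fill a SpecificInternalRow through a RowUpdater or a
+ * pre-sized ArrayData through an ArrayDataUpdater without knowing which container they target.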
+ */ + sealed trait CatalystDataUpdater { + def set(ordinal: Int, value: Any): Unit + + def setNullAt(ordinal: Int): Unit = set(ordinal, null) + def setBoolean(ordinal: Int, value: Boolean): Unit = set(ordinal, value) + def setByte(ordinal: Int, value: Byte): Unit = set(ordinal, value) + def setShort(ordinal: Int, value: Short): Unit = set(ordinal, value) + def setInt(ordinal: Int, value: Int): Unit = set(ordinal, value) + def setLong(ordinal: Int, value: Long): Unit = set(ordinal, value) + def setDouble(ordinal: Int, value: Double): Unit = set(ordinal, value) + def setFloat(ordinal: Int, value: Float): Unit = set(ordinal, value) + def setDecimal(ordinal: Int, value: Decimal): Unit = set(ordinal, value) + } + + final class RowUpdater(row: InternalRow) extends CatalystDataUpdater { + override def set(ordinal: Int, value: Any): Unit = row.update(ordinal, value) + + override def setNullAt(ordinal: Int): Unit = row.setNullAt(ordinal) + override def setBoolean(ordinal: Int, value: Boolean): Unit = row.setBoolean(ordinal, value) + override def setByte(ordinal: Int, value: Byte): Unit = row.setByte(ordinal, value) + override def setShort(ordinal: Int, value: Short): Unit = row.setShort(ordinal, value) + override def setInt(ordinal: Int, value: Int): Unit = row.setInt(ordinal, value) + override def setLong(ordinal: Int, value: Long): Unit = row.setLong(ordinal, value) + override def setDouble(ordinal: Int, value: Double): Unit = row.setDouble(ordinal, value) + override def setFloat(ordinal: Int, value: Float): Unit = row.setFloat(ordinal, value) + override def setDecimal(ordinal: Int, value: Decimal): Unit = + row.setDecimal(ordinal, value, value.precision) + } + + final class ArrayDataUpdater(array: ArrayData) extends CatalystDataUpdater { + override def set(ordinal: Int, value: Any): Unit = array.update(ordinal, value) + + override def setNullAt(ordinal: Int): Unit = array.setNullAt(ordinal) + override def setBoolean(ordinal: Int, value: Boolean): Unit = array.setBoolean(ordinal, value) + override def setByte(ordinal: Int, value: Byte): Unit = array.setByte(ordinal, value) + override def setShort(ordinal: Int, value: Short): Unit = array.setShort(ordinal, value) + override def setInt(ordinal: Int, value: Int): Unit = array.setInt(ordinal, value) + override def setLong(ordinal: Int, value: Long): Unit = array.setLong(ordinal, value) + override def setDouble(ordinal: Int, value: Double): Unit = array.setDouble(ordinal, value) + override def setFloat(ordinal: Int, value: Float): Unit = array.setFloat(ordinal, value) + override def setDecimal(ordinal: Int, value: Decimal): Unit = array.update(ordinal, value) + } +} diff --git a/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/avro/Spark2HoodieAvroDeserializer.scala b/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/avro/Spark2HoodieAvroDeserializer.scala new file mode 100644 index 0000000000000..ac2c82f70dacf --- /dev/null +++ b/hudi-spark-datasource/hudi-spark2/src/main/scala/org/apache/spark/sql/avro/Spark2HoodieAvroDeserializer.scala @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.avro + +import org.apache.avro.Schema +import org.apache.spark.sql.types.DataType + +/** + * This is Spark 2 implementation for the [[HoodieAvroDeserializerTrait]] leveraging [[PatchedAvroDeserializer]], + * which is just copied over version of [[AvroDeserializer]] from Spark 2.4.4 w/ SPARK-30267 being back-ported to it + */ +class Spark2HoodieAvroDeserializer(rootAvroType: Schema, rootCatalystType: DataType) + extends HoodieAvroDeserializerTrait { + + private val avroDeserializer = new PatchedAvroDeserializer(rootAvroType, rootCatalystType) + + def doDeserialize(data: Any): Any = avroDeserializer.deserialize(data) +} diff --git a/hudi-spark-datasource/hudi-spark3-common/pom.xml b/hudi-spark-datasource/hudi-spark3-common/pom.xml index affa987372963..30e7bda2e2eb9 100644 --- a/hudi-spark-datasource/hudi-spark3-common/pom.xml +++ b/hudi-spark-datasource/hudi-spark3-common/pom.xml @@ -168,6 +168,15 @@ org.apache.spark spark-sql_2.12 ${spark3.version} + provided + true + + + + org.apache.spark + spark-avro_2.12 + ${spark3.version} + provided true @@ -244,4 +253,4 @@ - \ No newline at end of file + diff --git a/hudi-spark-datasource/hudi-spark3-common/src/main/scala/org/apache/spark/sql/HoodieSpark3SqlUtils.scala b/hudi-spark-datasource/hudi-spark3-common/src/main/scala/org/apache/spark/sql/HoodieSpark3SqlUtils.scala new file mode 100644 index 0000000000000..c4c6fd682df5f --- /dev/null +++ b/hudi-spark-datasource/hudi-spark3-common/src/main/scala/org/apache/spark/sql/HoodieSpark3SqlUtils.scala @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql + +import org.apache.hudi.exception.HoodieException +import org.apache.spark.sql.catalyst.catalog.BucketSpec +import org.apache.spark.sql.connector.expressions.{BucketTransform, FieldReference, IdentityTransform, Transform} + +import scala.collection.mutable + +object HoodieSpark3SqlUtils { + def convertTransforms(partitions: Seq[Transform]): (Seq[String], Option[BucketSpec]) = { + val identityCols = new mutable.ArrayBuffer[String] + var bucketSpec = Option.empty[BucketSpec] + + partitions.map { + case IdentityTransform(FieldReference(Seq(col))) => + identityCols += col + + + case BucketTransform(numBuckets, FieldReference(Seq(col))) => + bucketSpec = Some(BucketSpec(numBuckets, col :: Nil, Nil)) + + case _ => + throw new HoodieException(s"Partitioning by expressions is not supported.") + } + + (identityCols, bucketSpec) + } +} diff --git a/hudi-spark-datasource/hudi-spark3-common/src/main/scala/org/apache/spark/sql/adapter/Spark3Adapter.scala b/hudi-spark-datasource/hudi-spark3-common/src/main/scala/org/apache/spark/sql/adapter/Spark3Adapter.scala index 61fcc9634f3f3..8f073bb1cdaaf 100644 --- a/hudi-spark-datasource/hudi-spark3-common/src/main/scala/org/apache/spark/sql/adapter/Spark3Adapter.scala +++ b/hudi-spark-datasource/hudi-spark3-common/src/main/scala/org/apache/spark/sql/adapter/Spark3Adapter.scala @@ -17,10 +17,11 @@ package org.apache.spark.sql.adapter +import org.apache.avro.Schema import org.apache.hudi.Spark3RowSerDe import org.apache.hudi.client.utils.SparkRowSerDe import org.apache.hudi.spark3.internal.ReflectUtil -import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.avro.{HoodieAvroDeserializerTrait, HoodieAvroSerializerTrait, Spark3HoodieAvroDeserializer, HoodieAvroSerializer} import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.{Expression, Like} @@ -30,16 +31,24 @@ import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoStatement, Join, J import org.apache.spark.sql.catalyst.{AliasIdentifier, TableIdentifier} import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ import org.apache.spark.sql.connector.catalog.Table +import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation -import org.apache.spark.sql.execution.datasources.{LogicalRelation, Spark3ParsePartitionUtil, SparkParsePartitionUtil} import org.apache.spark.sql.hudi.SparkAdapter import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.DataType +import org.apache.spark.sql.{Row, SparkSession} /** * The adapter for spark3. 
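+ * It constructs Hoodie's Avro serializer/deserializer for Spark 3 and, unlike Spark2Adapter,
+ * can delegate file-partition planning straight to Spark's own FilePartition#getFilePartitions.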
*/ class Spark3Adapter extends SparkAdapter { + def createAvroSerializer(rootCatalystType: DataType, rootAvroType: Schema, nullable: Boolean): HoodieAvroSerializerTrait = + new HoodieAvroSerializer(rootCatalystType, rootAvroType, nullable) + + def createAvroDeserializer(rootAvroType: Schema, rootCatalystType: DataType): HoodieAvroDeserializerTrait = + new Spark3HoodieAvroDeserializer(rootAvroType, rootCatalystType) + override def createSparkRowSerDe(encoder: ExpressionEncoder[Row]): SparkRowSerDe = { new Spark3RowSerDe(encoder) } @@ -94,4 +103,24 @@ class Spark3Adapter extends SparkAdapter { override def parseMultipartIdentifier(parser: ParserInterface, sqlText: String): Seq[String] = { parser.parseMultipartIdentifier(sqlText) } + + /** + * Combine [[PartitionedFile]] to [[FilePartition]] according to `maxSplitBytes`. + */ + override def getFilePartitions( + sparkSession: SparkSession, + partitionedFiles: Seq[PartitionedFile], + maxSplitBytes: Long): Seq[FilePartition] = { + FilePartition.getFilePartitions(sparkSession, partitionedFiles, maxSplitBytes) + } + + override def isHoodieTable(table: LogicalPlan, spark: SparkSession): Boolean = { + tripAlias(table) match { + case LogicalRelation(_, _, Some(tbl), _) => isHoodieTable(tbl) + case relation: UnresolvedRelation => + isHoodieTable(toTableIdentifier(relation), spark) + case DataSourceV2Relation(table: Table, _, _, _, _) => isHoodieTable(table.properties()) + case _=> false + } + } } diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/avro/HoodieAvroDeserializer.scala b/hudi-spark-datasource/hudi-spark3-common/src/main/scala/org/apache/spark/sql/avro/Spark3HoodieAvroDeserializer.scala similarity index 67% rename from hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/avro/HoodieAvroDeserializer.scala rename to hudi-spark-datasource/hudi-spark3-common/src/main/scala/org/apache/spark/sql/avro/Spark3HoodieAvroDeserializer.scala index 1678dc05da4f8..fa03f5d841cfb 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/avro/HoodieAvroDeserializer.scala +++ b/hudi-spark-datasource/hudi-spark3-common/src/main/scala/org/apache/spark/sql/avro/Spark3HoodieAvroDeserializer.scala @@ -18,20 +18,15 @@ package org.apache.spark.sql.avro import org.apache.avro.Schema - import org.apache.hudi.HoodieSparkUtils - import org.apache.spark.sql.types.DataType -/** - * This is to be compatible with the type returned by Spark 3.1 - * and other spark versions for AvroDeserializer - */ -case class HoodieAvroDeserializer(rootAvroType: Schema, rootCatalystType: DataType) { +class Spark3HoodieAvroDeserializer(rootAvroType: Schema, rootCatalystType: DataType) + extends HoodieAvroDeserializerTrait { + // SPARK-34404: As of Spark3.2, there is no AvroDeserializer's constructor with Schema and DataType arguments. + // So use the reflection to get AvroDeserializer instance. private val avroDeserializer = if (HoodieSparkUtils.isSpark3_2) { - // SPARK-34404: As of Spark3.2, there is no AvroDeserializer's constructor with Schema and DataType arguments. - // So use the reflection to get AvroDeserializer instance. 
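+    // The extra String argument is Spark 3.2's datetime rebase mode; "EXCEPTION" should make
+    // the deserializer fail fast rather than silently rebase ancient dates between calendars.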
val constructor = classOf[AvroDeserializer].getConstructor(classOf[Schema], classOf[DataType], classOf[String]) constructor.newInstance(rootAvroType, rootCatalystType, "EXCEPTION") } else { @@ -39,10 +34,5 @@ case class HoodieAvroDeserializer(rootAvroType: Schema, rootCatalystType: DataTy constructor.newInstance(rootAvroType, rootCatalystType) } - def deserializeData(data: Any): Any = { - avroDeserializer.deserialize(data) match { - case Some(r) => r // As of spark 3.1, this will return data wrapped with Option, so we fetch the data. - case o => o // for other spark version, return the data directly. - } - } + def doDeserialize(data: Any): Any = avroDeserializer.deserialize(data) } diff --git a/hudi-spark-datasource/hudi-spark3/pom.xml b/hudi-spark-datasource/hudi-spark3/pom.xml index d8dba8384886c..722a1b4101241 100644 --- a/hudi-spark-datasource/hudi-spark3/pom.xml +++ b/hudi-spark-datasource/hudi-spark3/pom.xml @@ -158,6 +158,7 @@ org.apache.spark spark-sql_2.12 ${spark3.version} + provided true diff --git a/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/hudi/Spark3DefaultSource.scala b/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/hudi/Spark3DefaultSource.scala index b553790878e42..d94fee1f410ae 100644 --- a/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/hudi/Spark3DefaultSource.scala +++ b/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/hudi/Spark3DefaultSource.scala @@ -17,8 +17,30 @@ package org.apache.hudi +import org.apache.hudi.exception.HoodieException +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.connector.catalog.{Table, TableProvider} +import org.apache.spark.sql.connector.expressions.Transform +import org.apache.spark.sql.hudi.catalog.HoodieInternalV2Table import org.apache.spark.sql.sources.DataSourceRegister +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.CaseInsensitiveStringMap + +class Spark3DefaultSource extends DefaultSource with DataSourceRegister with TableProvider { -class Spark3DefaultSource extends DefaultSource with DataSourceRegister { override def shortName(): String = "hudi" + + def inferSchema: StructType = new StructType() + + override def inferSchema(options: CaseInsensitiveStringMap): StructType = inferSchema + + override def getTable(schema: StructType, + partitioning: Array[Transform], + properties: java.util.Map[String, String]): Table = { + val options = new CaseInsensitiveStringMap(properties) + val path = options.get("path") + if (path == null) throw new HoodieException("'path' cannot be null, missing 'path' from table properties") + + HoodieInternalV2Table(SparkSession.active, path) + } } diff --git a/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/connector/catalog/HoodieIdentifier.scala b/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/connector/catalog/HoodieIdentifier.scala new file mode 100644 index 0000000000000..2649c56e5a8a4 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/connector/catalog/HoodieIdentifier.scala @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.connector.catalog
+
+import java.util
+import java.util.Objects
+
+/**
+ * This class is here to keep scala-2.11 compilable: using Identifier.of(namespace, name)
+ * to get an IdentifierImpl would throw a compile exception
+ * (Static methods in interface require -target:jvm-1.8).
+ */
+case class HoodieIdentifier(namespace: Array[String], name: String) extends Identifier {
+
+  override def equals(o: Any): Boolean = {
+    o match {
+      case that: HoodieIdentifier => util.Arrays.equals(namespace.asInstanceOf[Array[Object]],
+        that.namespace.asInstanceOf[Array[Object]]) && name == that.name
+      case _ => false
+    }
+  }
+
+  override def hashCode: Int = {
+    val nh = namespace.toSeq.hashCode().asInstanceOf[Object]
+    Objects.hash(nh, name)
+  }
+}
diff --git a/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodieSpark3Analysis.scala b/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodieSpark3Analysis.scala
new file mode 100644
index 0000000000000..e20f934592e45
--- /dev/null
+++ b/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodieSpark3Analysis.scala
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.spark.sql.hudi.analysis + +import org.apache.hudi.common.table.HoodieTableMetaClient +import org.apache.hudi.{DefaultSource, SparkAdapterSupport} +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.analysis.{ResolvedTable, UnresolvedPartitionSpec} +import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute} +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.IdentifierHelper +import org.apache.spark.sql.execution.datasources.LogicalRelation +import org.apache.spark.sql.execution.datasources.PreWriteCheck.failAnalysis +import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, V2SessionCatalog} +import org.apache.spark.sql.hudi.{HoodieSqlCommonUtils, ProvidesHoodieConfig} +import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.{castIfNeeded, getTableLocation, removeMetaFields, tableExistsInPath} +import org.apache.spark.sql.hudi.catalog.{HoodieCatalog, HoodieInternalV2Table} +import org.apache.spark.sql.hudi.command.{AlterHoodieTableDropPartitionCommand, ShowHoodieTablePartitionsCommand, TruncateHoodieTableCommand} +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{AnalysisException, SQLContext, SparkSession} + +import scala.collection.JavaConverters.mapAsJavaMapConverter + +/** + * Rule for convert the logical plan to command. + * @param sparkSession + */ +case class HoodieSpark3Analysis(sparkSession: SparkSession) extends Rule[LogicalPlan] + with SparkAdapterSupport with ProvidesHoodieConfig { + + override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsDown { + case dsv2 @ DataSourceV2Relation(d: HoodieInternalV2Table, _, _, _, _) => + val output = dsv2.output + val catalogTable = if (d.catalogTable.isDefined) { + Some(d.v1Table) + } else { + None + } + val relation = new DefaultSource().createRelation(new SQLContext(sparkSession), + buildHoodieConfig(d.hoodieCatalogTable)) + LogicalRelation(relation, output, catalogTable, isStreaming = false) + case a @ InsertIntoStatement(r: DataSourceV2Relation, partitionSpec, _, _, _, _) if a.query.resolved && + r.table.isInstanceOf[HoodieInternalV2Table] && + needsSchemaAdjustment(a.query, r.table.asInstanceOf[HoodieInternalV2Table], partitionSpec, r.schema) => + val projection = resolveQueryColumnsByOrdinal(a.query, r.output) + if (projection != a.query) { + a.copy(query = projection) + } else { + a + } + } + + /** + * Need to adjust schema based on the query and relation schema, for example, + * if using insert into xx select 1, 2 here need to map to column names + * @param query + * @param hoodieTable + * @param partitionSpec + * @param schema + * @return + */ + private def needsSchemaAdjustment(query: LogicalPlan, + hoodieTable: HoodieInternalV2Table, + partitionSpec: Map[String, Option[String]], + schema: StructType): Boolean = { + val output = query.output + val queryOutputWithoutMetaFields = removeMetaFields(output) + val partitionFields = hoodieTable.hoodieCatalogTable.partitionFields + val partitionSchema = hoodieTable.hoodieCatalogTable.partitionSchema + val staticPartitionValues = partitionSpec.filter(p => p._2.isDefined).mapValues(_.get) + + assert(staticPartitionValues.isEmpty || + staticPartitionValues.size == partitionSchema.size, + s"Required partition columns is: ${partitionSchema.json}, Current static partitions " + + s"is: ${staticPartitionValues.mkString("," + "")}") + + 
+    assert(staticPartitionValues.size + queryOutputWithoutMetaFields.size
+      == hoodieTable.hoodieCatalogTable.tableSchemaWithoutMetaFields.size,
+      s"Required select columns count: ${hoodieTable.hoodieCatalogTable.tableSchemaWithoutMetaFields.size}, " +
+        s"current select columns (including static partition columns) count: " +
+        s"${staticPartitionValues.size + queryOutputWithoutMetaFields.size}, columns: " +
+        s"(${(queryOutputWithoutMetaFields.map(_.name) ++ staticPartitionValues.keys).mkString(",")})")
+
+    // Static partition insert: drop the partition fields from the original schema so that
+    // the remaining fields can be aligned with the query output.
+    val expectedSchema = if (staticPartitionValues.nonEmpty) {
+      StructType(schema.filterNot(p => partitionFields.contains(p.name)))
+    } else {
+      schema
+    }
+
+    val existingSchemaOutput = output.take(expectedSchema.length)
+    existingSchemaOutput.map(_.name) != expectedSchema.map(_.name) ||
+      existingSchemaOutput.map(_.dataType) != expectedSchema.map(_.dataType)
+  }
+
+  private def resolveQueryColumnsByOrdinal(query: LogicalPlan,
+                                           targetAttrs: Seq[Attribute]): LogicalPlan = {
+    // Always add a Cast; the optimizer removes it again if it is unnecessary.
+    val project = query.output.zipWithIndex.map { case (attr, i) =>
+      if (i < targetAttrs.length) {
+        val targetAttr = targetAttrs(i)
+        val castAttr = castIfNeeded(attr.withNullability(targetAttr.nullable), targetAttr.dataType, conf)
+        Alias(castAttr, targetAttr.name)()
+      } else {
+        attr
+      }
+    }
+    Project(project, query)
+  }
+}
+
+/**
+ * Rule for resolving Hudi's extended syntax and rewriting some logical plans.
+ * @param sparkSession the active SparkSession
+ */
+case class HoodieSpark3ResolveReferences(sparkSession: SparkSession) extends Rule[LogicalPlan]
+  with SparkAdapterSupport with ProvidesHoodieConfig {
+
+  def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperatorsUp {
+    // Fill in the schema for CREATE TABLE statements that do not specify one
+    case c @ CreateV2Table(tableCatalog, tableName, schema, partitioning, properties, _)
+      if sparkAdapter.isHoodieTable(properties.asJava) =>
+
+      if (schema.isEmpty && partitioning.nonEmpty) {
+        failAnalysis("It is not allowed to specify partition columns when the table schema is " +
+          "not defined. When the table schema is not provided, schema and partition columns " +
+          "will be inferred.")
+      }
+      val hoodieCatalog = tableCatalog match {
+        case catalog: HoodieCatalog => catalog
+        case _ => tableCatalog.asInstanceOf[V2SessionCatalog]
+      }
+      val tablePath = getTableLocation(properties,
+        TableIdentifier(tableName.name(), tableName.namespace().lastOption), sparkSession)
+
+      val tableExistsInCatalog = hoodieCatalog.tableExists(tableName)
+      // Only when the table does not yet exist in the catalog do we need to fill in the schema for table creation.
+      if (!tableExistsInCatalog && tableExistsInPath(tablePath, sparkSession.sessionState.newHadoopConf())) {
+        val metaClient = HoodieTableMetaClient.builder()
+          .setBasePath(tablePath)
+          .setConf(sparkSession.sessionState.newHadoopConf())
+          .build()
+        val tableSchema = HoodieSqlCommonUtils.getTableSqlSchema(metaClient)
+        if (tableSchema.isDefined && schema.isEmpty) {
+          // Fill in the schema from the existing table
+          c.copy(tableSchema = tableSchema.get)
+        } else if (tableSchema.isDefined && schema != tableSchema.get) {
+          throw new AnalysisException(s"The schema specified in the create table statement does not match " +
+            s"the table schema. You should not specify the schema for an existing table: $tableName")
+        } else {
+          c
+        }
+      } else {
+        c
+      }
+    case p => p
+  }
+}
+
+/**
+ * Rule for rewriting some Spark commands to Hudi's implementations.
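+ * Currently SHOW PARTITIONS, TRUNCATE TABLE and ALTER TABLE ... DROP PARTITION on a Hudi
+ * table are redirected to the corresponding Hudi commands (see the cases below).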
+ * @param sparkSession + */ +case class HoodieSpark3PostAnalysisRule(sparkSession: SparkSession) extends Rule[LogicalPlan] { + override def apply(plan: LogicalPlan): LogicalPlan = { + plan match { + case ShowPartitions(child, specOpt, _) + if child.isInstanceOf[ResolvedTable] && + child.asInstanceOf[ResolvedTable].table.isInstanceOf[HoodieInternalV2Table] => + ShowHoodieTablePartitionsCommand(child.asInstanceOf[ResolvedTable].identifier.asTableIdentifier, specOpt.map(s => s.asInstanceOf[UnresolvedPartitionSpec].spec)) + + // Rewrite TruncateTableCommand to TruncateHoodieTableCommand + case TruncateTable(child) + if child.isInstanceOf[ResolvedTable] && + child.asInstanceOf[ResolvedTable].table.isInstanceOf[HoodieInternalV2Table] => + new TruncateHoodieTableCommand(child.asInstanceOf[ResolvedTable].identifier.asTableIdentifier, None) + + case DropPartitions(child, specs, ifExists, purge) + if child.resolved && child.isInstanceOf[ResolvedTable] && child.asInstanceOf[ResolvedTable].table.isInstanceOf[HoodieInternalV2Table] => + AlterHoodieTableDropPartitionCommand( + child.asInstanceOf[ResolvedTable].identifier.asTableIdentifier, + specs.seq.map(f => f.asInstanceOf[UnresolvedPartitionSpec]).map(s => s.spec), + ifExists, + purge, + retainData = true + ) + + case _ => plan + } + } +} diff --git a/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/hudi/catalog/BasicStagedTable.scala b/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/hudi/catalog/BasicStagedTable.scala new file mode 100644 index 0000000000000..67d9e1ebb2bf8 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/hudi/catalog/BasicStagedTable.scala @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.catalog + +import org.apache.hudi.exception.HoodieException +import org.apache.spark.sql.connector.catalog._ +import org.apache.spark.sql.connector.expressions.Transform +import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder} +import org.apache.spark.sql.types.StructType + +import java.util + +/** + * Basic implementation that represents a table which is staged for being committed. 
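+ * For non-Hudi tables the delegated catalog has already created the table eagerly, so
+ * commitStagedChanges below is a no-op and abortStagedChanges simply drops the table again.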
+ * @param ident table ident + * @param table table + * @param catalog table catalog + */ +case class BasicStagedTable(ident: Identifier, + table: Table, + catalog: TableCatalog) extends SupportsWrite with StagedTable { + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { + info match { + case supportsWrite: SupportsWrite => supportsWrite.newWriteBuilder(info) + case _ => throw new HoodieException(s"Table `${ident.name}` does not support writes.") + } + } + + override def abortStagedChanges(): Unit = catalog.dropTable(ident) + + override def commitStagedChanges(): Unit = {} + + override def name(): String = ident.name() + + override def schema(): StructType = table.schema() + + override def partitioning(): Array[Transform] = table.partitioning() + + override def capabilities(): util.Set[TableCapability] = table.capabilities() + + override def properties(): util.Map[String, String] = table.properties() +} diff --git a/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/hudi/catalog/HoodieCatalog.scala b/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/hudi/catalog/HoodieCatalog.scala new file mode 100644 index 0000000000000..3046af991404b --- /dev/null +++ b/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/hudi/catalog/HoodieCatalog.scala @@ -0,0 +1,303 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi.catalog + +import org.apache.hadoop.fs.Path +import org.apache.hudi.{DataSourceWriteOptions, SparkAdapterSupport} +import org.apache.hudi.client.common.HoodieSparkEngineContext +import org.apache.hudi.common.fs.FSUtils +import org.apache.hudi.exception.HoodieException +import org.apache.hudi.hive.util.ConfigUtils +import org.apache.hudi.sql.InsertMode +import org.apache.spark.sql.HoodieSpark3SqlUtils.convertTransforms +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.analysis.{NoSuchTableException, TableAlreadyExistsException, UnresolvedAttribute} +import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType, CatalogUtils, HoodieCatalogTable} +import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.IdentifierHelper +import org.apache.spark.sql.connector.catalog.TableChange.{AddColumn, ColumnChange, UpdateColumnComment, UpdateColumnType} +import org.apache.spark.sql.connector.catalog._ +import org.apache.spark.sql.connector.expressions.Transform +import org.apache.spark.sql.execution.datasources.DataSource +import org.apache.spark.sql.hudi.command.{AlterHoodieTableAddColumnsCommand, AlterHoodieTableChangeColumnCommand, AlterHoodieTableRenameCommand, CreateHoodieTableCommand} +import org.apache.spark.sql.hudi.{HoodieSqlCommonUtils, ProvidesHoodieConfig} +import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.sql.{Dataset, SaveMode, SparkSession, _} + +import java.util +import scala.collection.JavaConverters.{mapAsJavaMapConverter, mapAsScalaMapConverter} + +class HoodieCatalog extends DelegatingCatalogExtension + with StagingTableCatalog + with SparkAdapterSupport + with ProvidesHoodieConfig { + + val spark: SparkSession = SparkSession.active + + override def stageCreate(ident: Identifier, schema: StructType, partitions: Array[Transform], properties: util.Map[String, String]): StagedTable = { + if (sparkAdapter.isHoodieTable(properties)) { + HoodieStagedTable(ident, this, schema, partitions, properties, TableCreationMode.STAGE_CREATE) + } else { + BasicStagedTable( + ident, + super.createTable(ident, schema, partitions, properties), + this) + } + } + + override def stageReplace(ident: Identifier, schema: StructType, partitions: Array[Transform], properties: util.Map[String, String]): StagedTable = { + if (sparkAdapter.isHoodieTable(properties)) { + HoodieStagedTable(ident, this, schema, partitions, properties, TableCreationMode.STAGE_REPLACE) + } else { + super.dropTable(ident) + BasicStagedTable( + ident, + super.createTable(ident, schema, partitions, properties), + this) + } + } + + override def stageCreateOrReplace(ident: Identifier, + schema: StructType, + partitions: Array[Transform], + properties: util.Map[String, String]): StagedTable = { + if (sparkAdapter.isHoodieTable(properties)) { + HoodieStagedTable( + ident, this, schema, partitions, properties, TableCreationMode.CREATE_OR_REPLACE) + } else { + try super.dropTable(ident) catch { + case _: NoSuchTableException => // ignore the exception + } + BasicStagedTable( + ident, + super.createTable(ident, schema, partitions, properties), + this) + } + } + + override def loadTable(ident: Identifier): Table = { + try { + super.loadTable(ident) match { + case v1: V1Table if sparkAdapter.isHoodieTable(v1.catalogTable) => + HoodieInternalV2Table( + spark, + v1.catalogTable.location.toString, + catalogTable = Some(v1.catalogTable), + tableIdentifier = Some(ident.toString)) + case o => o + } + } catch { + 
case e: Exception => + throw e + } + } + + override def createTable(ident: Identifier, + schema: StructType, + partitions: Array[Transform], + properties: util.Map[String, String]): Table = { + createHoodieTable(ident, schema, partitions, properties, Map.empty, Option.empty, TableCreationMode.CREATE) + } + + override def tableExists(ident: Identifier): Boolean = super.tableExists(ident) + + override def dropTable(ident: Identifier): Boolean = super.dropTable(ident) + + override def purgeTable(ident: Identifier): Boolean = { + val table = loadTable(ident) + table match { + case hoodieTable: HoodieInternalV2Table => + val location = hoodieTable.hoodieCatalogTable.tableLocation + val targetPath = new Path(location) + val engineContext = new HoodieSparkEngineContext(spark.sparkContext) + val fs = FSUtils.getFs(location, spark.sparkContext.hadoopConfiguration) + FSUtils.deleteDir(engineContext, fs, targetPath, spark.sparkContext.defaultParallelism) + super.dropTable(ident) + case _ => + } + true + } + + @throws[NoSuchTableException] + @throws[TableAlreadyExistsException] + override def renameTable(oldIdent: Identifier, newIdent: Identifier): Unit = { + loadTable(oldIdent) match { + case _: HoodieInternalV2Table => + new AlterHoodieTableRenameCommand(oldIdent.asTableIdentifier, newIdent.asTableIdentifier, false).run(spark) + case _ => super.renameTable(oldIdent, newIdent) + } + } + + override def alterTable(ident: Identifier, changes: TableChange*): Table = { + val tableIdent = TableIdentifier(ident.name(), ident.namespace().lastOption) + // scalastyle:off + val table = loadTable(ident) match { + case hoodieTable: HoodieInternalV2Table => hoodieTable + case _ => return super.alterTable(ident, changes: _*) + } + // scalastyle:on + + val grouped = changes.groupBy(c => c.getClass) + + grouped.foreach { + case (t, newColumns) if t == classOf[AddColumn] => + AlterHoodieTableAddColumnsCommand( + tableIdent, + newColumns.asInstanceOf[Seq[AddColumn]].map { col => + StructField( + col.fieldNames()(0), + col.dataType(), + col.isNullable) + }).run(spark) + case (t, columnChanges) if classOf[ColumnChange].isAssignableFrom(t) => + columnChanges.foreach { + case dataType: UpdateColumnType => + val colName = UnresolvedAttribute(dataType.fieldNames()).name + val newDataType = dataType.newDataType() + val structField = StructField(colName, newDataType) + AlterHoodieTableChangeColumnCommand(tableIdent, colName, structField).run(spark) + case dataType: UpdateColumnComment => + val newComment = dataType.newComment() + val colName = UnresolvedAttribute(dataType.fieldNames()).name + val fieldOpt = table.schema().findNestedField(dataType.fieldNames(), includeCollections = true, + spark.sessionState.conf.resolver).map(_._2) + val field = fieldOpt.getOrElse { + throw new AnalysisException( + s"Couldn't find column $colName in:\n${table.schema().treeString}") + } + AlterHoodieTableChangeColumnCommand(tableIdent, colName, field.withComment(newComment)).run(spark) + } + case (t, _) => + throw new UnsupportedOperationException(s"not supported table change: ${t.getClass}") + } + + loadTable(ident) + } + + def createHoodieTable(ident: Identifier, + schema: StructType, + partitions: Array[Transform], + allTableProperties: util.Map[String, String], + writeOptions: Map[String, String], + sourceQuery: Option[DataFrame], + operation: TableCreationMode): Table = { + + val (partitionColumns, maybeBucketSpec) = convertTransforms(partitions) + val newSchema = schema + val newPartitionColumns = partitionColumns + val newBucketSpec = 
maybeBucketSpec + + val isByPath = isPathIdentifier(ident) + + val location = if (isByPath) Option(ident.name()) else Option(allTableProperties.get("location")) + val id = ident.asTableIdentifier + + val locUriOpt = location.map(CatalogUtils.stringToURI) + val existingTableOpt = getExistingTableIfExists(id) + val loc = locUriOpt + .orElse(existingTableOpt.flatMap(_.storage.locationUri)) + .getOrElse(spark.sessionState.catalog.defaultTablePath(id)) + val storage = DataSource.buildStorageFormatFromOptions(writeOptions) + .copy(locationUri = Option(loc)) + val tableType = + if (location.isDefined) CatalogTableType.EXTERNAL else CatalogTableType.MANAGED + val commentOpt = Option(allTableProperties.get("comment")) + + val tablePropertiesNew = new util.HashMap[String, String](allTableProperties) + // put path to table properties. + tablePropertiesNew.put("path", loc.getPath) + + val tableDesc = new CatalogTable( + identifier = id, + tableType = tableType, + storage = storage, + schema = newSchema, + provider = Option("hudi"), + partitionColumnNames = newPartitionColumns, + bucketSpec = newBucketSpec, + properties = tablePropertiesNew.asScala.toMap, + comment = commentOpt) + + val hoodieCatalogTable = HoodieCatalogTable(spark, tableDesc) + + if (operation == TableCreationMode.STAGE_CREATE) { + val tablePath = hoodieCatalogTable.tableLocation + val hadoopConf = spark.sessionState.newHadoopConf() + assert(HoodieSqlCommonUtils.isEmptyPath(tablePath, hadoopConf), + s"Path '$tablePath' should be empty for CTAS") + hoodieCatalogTable.initHoodieTable() + + val tblProperties = hoodieCatalogTable.catalogProperties + val options = Map( + DataSourceWriteOptions.HIVE_CREATE_MANAGED_TABLE.key -> (tableDesc.tableType == CatalogTableType.MANAGED).toString, + DataSourceWriteOptions.HIVE_TABLE_SERDE_PROPERTIES.key -> ConfigUtils.configToString(tblProperties.asJava), + DataSourceWriteOptions.HIVE_TABLE_PROPERTIES.key -> ConfigUtils.configToString(tableDesc.properties.asJava), + DataSourceWriteOptions.SQL_INSERT_MODE.key -> InsertMode.NON_STRICT.value(), + DataSourceWriteOptions.SQL_ENABLE_BULK_INSERT.key -> "true" + ) + saveSourceDF(sourceQuery, tableDesc.properties ++ buildHoodieInsertConfig(hoodieCatalogTable, spark, isOverwrite = false, Map.empty, options)) + CreateHoodieTableCommand.createTableInCatalog(spark, hoodieCatalogTable, ignoreIfExists = false) + } else if (sourceQuery.isEmpty) { + saveSourceDF(sourceQuery, tableDesc.properties) + new CreateHoodieTableCommand(tableDesc, false).run(spark) + } else { + saveSourceDF(sourceQuery, tableDesc.properties ++ buildHoodieInsertConfig(hoodieCatalogTable, spark, isOverwrite = false, Map.empty, Map.empty)) + new CreateHoodieTableCommand(tableDesc, false).run(spark) + } + + loadTable(ident) + } + + private def isPathIdentifier(ident: Identifier) = new Path(ident.name()).isAbsolute + + protected def isPathIdentifier(table: CatalogTable): Boolean = { + isPathIdentifier(table.identifier) + } + + protected def isPathIdentifier(tableIdentifier: TableIdentifier): Boolean = { + isPathIdentifier(HoodieIdentifier(tableIdentifier.database.toArray, tableIdentifier.table)) + } + + private def getExistingTableIfExists(table: TableIdentifier): Option[CatalogTable] = { + // If this is a path identifier, we cannot return an existing CatalogTable. 
The Create command + // will check the file system itself + val catalog = spark.sessionState.catalog + // scalastyle:off + if (isPathIdentifier(table)) return None + // scalastyle:on + val tableExists = catalog.tableExists(table) + if (tableExists) { + val oldTable = catalog.getTableMetadata(table) + if (oldTable.tableType == CatalogTableType.VIEW) throw new HoodieException( + s"$table is a view. You may not write data into a view.") + if (!sparkAdapter.isHoodieTable(oldTable)) throw new HoodieException(s"$table is not a Hoodie table.") + Some(oldTable) + } else None + } + + private def saveSourceDF(sourceQuery: Option[Dataset[_]], + properties: Map[String, String]): Unit = { + sourceQuery.map(df => { + df.write.format("org.apache.hudi") + .options(properties) + .mode(SaveMode.Append) + .save() + df + }) + } +} diff --git a/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/hudi/catalog/HoodieInternalV2Table.scala b/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/hudi/catalog/HoodieInternalV2Table.scala new file mode 100644 index 0000000000000..848925aafe417 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/hudi/catalog/HoodieInternalV2Table.scala @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hudi.catalog + +import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient} +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.{CatalogTable, HoodieCatalogTable} +import org.apache.spark.sql.connector.catalog.TableCapability._ +import org.apache.spark.sql.connector.catalog.{SupportsWrite, Table, TableCapability, V2TableWithV1Fallback} +import org.apache.spark.sql.connector.expressions.{FieldReference, IdentityTransform, Transform} +import org.apache.spark.sql.connector.write._ +import org.apache.spark.sql.hudi.ProvidesHoodieConfig +import org.apache.spark.sql.sources.{Filter, InsertableRelation} +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.CaseInsensitiveStringMap +import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} + +import java.util +import scala.collection.JavaConverters.{mapAsJavaMapConverter, setAsJavaSetConverter} + +case class HoodieInternalV2Table(spark: SparkSession, + path: String, + catalogTable: Option[CatalogTable] = None, + tableIdentifier: Option[String] = None, + options: CaseInsensitiveStringMap = CaseInsensitiveStringMap.empty()) + extends Table with SupportsWrite with V2TableWithV1Fallback { + + lazy val hoodieCatalogTable: HoodieCatalogTable = if (catalogTable.isDefined) { + HoodieCatalogTable(spark, catalogTable.get) + } else { + val metaClient: HoodieTableMetaClient = HoodieTableMetaClient.builder() + .setBasePath(path) + .setConf(SparkSession.active.sessionState.newHadoopConf) + .build() + + val tableConfig: HoodieTableConfig = metaClient.getTableConfig + val tableName: String = tableConfig.getTableName + + HoodieCatalogTable(spark, TableIdentifier(tableName)) + } + + private lazy val tableSchema: StructType = hoodieCatalogTable.tableSchema + + override def name(): String = hoodieCatalogTable.table.identifier.unquotedString + + override def schema(): StructType = tableSchema + + override def capabilities(): util.Set[TableCapability] = Set( + BATCH_READ, V1_BATCH_WRITE, OVERWRITE_BY_FILTER, TRUNCATE, ACCEPT_ANY_SCHEMA + ).asJava + + override def properties(): util.Map[String, String] = { + hoodieCatalogTable.catalogProperties.asJava + } + + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { + new HoodieV1WriteBuilder(info.options, hoodieCatalogTable, spark) + } + + override def v1Table: CatalogTable = hoodieCatalogTable.table + + override def partitioning(): Array[Transform] = { + hoodieCatalogTable.partitionFields.map { col => + new IdentityTransform(new FieldReference(Seq(col))) + }.toArray + } + +} + +private class HoodieV1WriteBuilder(writeOptions: CaseInsensitiveStringMap, + hoodieCatalogTable: HoodieCatalogTable, + spark: SparkSession) + extends SupportsTruncate with SupportsOverwrite with ProvidesHoodieConfig { + + private var forceOverwrite = false + + override def truncate(): HoodieV1WriteBuilder = { + forceOverwrite = true + this + } + + override def overwrite(filters: Array[Filter]): WriteBuilder = { + forceOverwrite = true + this + } + + override def build(): V1Write = new V1Write { + override def toInsertableRelation: InsertableRelation = { + new InsertableRelation { + override def insert(data: DataFrame, overwrite: Boolean): Unit = { + val mode = if (forceOverwrite && hoodieCatalogTable.partitionFields.isEmpty) { + // insert overwrite non-partition table + SaveMode.Overwrite + } else { + // for insert into or insert overwrite partition we use append mode. 
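+                // (Presumably the overwrite-a-partition semantics are then carried by Hudi's own
+                // insert-overwrite write operation, chosen via buildHoodieInsertConfig with the
+                // forceOverwrite flag below, rather than by Spark's SaveMode.)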
+ SaveMode.Append + } + alignOutputColumns(data).write.format("org.apache.hudi") + .mode(mode) + .options(buildHoodieConfig(hoodieCatalogTable) ++ + buildHoodieInsertConfig(hoodieCatalogTable, spark, forceOverwrite, Map.empty, Map.empty)) + .save() + } + } + } + } + + private def alignOutputColumns(data: DataFrame): DataFrame = { + val schema = hoodieCatalogTable.tableSchema + spark.createDataFrame(data.toJavaRDD, schema) + } +} diff --git a/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/hudi/catalog/HoodieStagedTable.scala b/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/hudi/catalog/HoodieStagedTable.scala new file mode 100644 index 0000000000000..4034862167aa5 --- /dev/null +++ b/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/hudi/catalog/HoodieStagedTable.scala @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.catalog + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hudi.DataSourceWriteOptions.RECORDKEY_FIELD +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.connector.catalog.{Identifier, StagedTable, SupportsWrite, TableCapability} +import org.apache.spark.sql.connector.expressions.Transform +import org.apache.spark.sql.connector.write.{LogicalWriteInfo, V1Write, WriteBuilder} +import org.apache.spark.sql.sources.InsertableRelation +import org.apache.spark.sql.types.StructType + +import java.util +import scala.collection.JavaConverters.{mapAsScalaMapConverter, setAsJavaSetConverter} + +case class HoodieStagedTable(ident: Identifier, + catalog: HoodieCatalog, + override val schema: StructType, + partitions: Array[Transform], + override val properties: util.Map[String, String], + mode: TableCreationMode) extends StagedTable with SupportsWrite { + + private var sourceQuery: Option[DataFrame] = None + private var writeOptions: Map[String, String] = Map.empty + + override def commitStagedChanges(): Unit = { + val props = new util.HashMap[String, String]() + val optionsThroughProperties = properties.asScala.collect { + case (k, _) if k.startsWith("option.") => k.stripPrefix("option.") + }.toSet + val sqlWriteOptions = new util.HashMap[String, String]() + properties.asScala.foreach { case (k, v) => + if (!k.startsWith("option.") && !optionsThroughProperties.contains(k)) { + props.put(k, v) + } else if (optionsThroughProperties.contains(k)) { + sqlWriteOptions.put(k, v) + } + } + if (writeOptions.isEmpty && !sqlWriteOptions.isEmpty) { + writeOptions = sqlWriteOptions.asScala.toMap + } + props.putAll(properties) + props.put("hoodie.table.name", ident.name()) + props.put(RECORDKEY_FIELD.key, properties.get("primaryKey")) + catalog.createHoodieTable(ident, 
schema, partitions, props, writeOptions, sourceQuery, mode) + } + + override def name(): String = ident.name() + + override def abortStagedChanges(): Unit = { + clearTablePath(properties.get("location"), catalog.spark.sparkContext.hadoopConfiguration) + } + + private def clearTablePath(tablePath: String, conf: Configuration): Unit = { + val path = new Path(tablePath) + val fs = path.getFileSystem(conf) + fs.delete(path, true) + } + + override def capabilities(): util.Set[TableCapability] = Set(TableCapability.V1_BATCH_WRITE).asJava + + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { + writeOptions = info.options.asCaseSensitiveMap().asScala.toMap + new HoodieV1WriteBuilder + } + + /* + * WriteBuilder for creating a Hoodie table. + */ + private class HoodieV1WriteBuilder extends WriteBuilder { + override def build(): V1Write = new V1Write { + override def toInsertableRelation(): InsertableRelation = { + new InsertableRelation { + override def insert(data: DataFrame, overwrite: Boolean): Unit = { + sourceQuery = Option(data) + } + } + } + } + } + +} diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/vector/MapColumnVector.java b/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/hudi/catalog/TableCreationMode.java similarity index 71% rename from hudi-flink/src/main/java/org/apache/hudi/table/format/cow/vector/MapColumnVector.java rename to hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/hudi/catalog/TableCreationMode.java index 38424dad7d3a7..8b54775be149e 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/vector/MapColumnVector.java +++ b/hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/hudi/catalog/TableCreationMode.java @@ -1,29 +1,23 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hudi.table.format.cow.vector; - -import org.apache.flink.table.data.MapData; -import org.apache.flink.table.data.vector.ColumnVector; - -/** - * Map column vector. - */ -public interface MapColumnVector extends ColumnVector { - MapData getMap(int i); -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hudi.catalog; + +public enum TableCreationMode { + CREATE, CREATE_OR_REPLACE, STAGE_CREATE, STAGE_REPLACE +} diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java index 8dc3c6e0e468f..50991852b2c3b 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java @@ -49,6 +49,9 @@ public class HiveSyncConfig implements Serializable { @Parameter(names = {"--jdbc-url"}, description = "Hive jdbc connect url") public String jdbcUrl; + @Parameter(names = {"--metastore-uris"}, description = "Hive metastore uris") + public String metastoreUris; + @Parameter(names = {"--base-path"}, description = "Basepath of hoodie table to sync", required = true) public String basePath; @@ -126,6 +129,9 @@ public class HiveSyncConfig implements Serializable { @Parameter(names = {"--conditional-sync"}, description = "If true, only sync on conditions like schema change or partition change.") public Boolean isConditionalSync = false; + @Parameter(names = {"--spark-version"}, description = "The spark version", required = false) + public String sparkVersion; + // enhance the similar function in child class public static HiveSyncConfig copy(HiveSyncConfig cfg) { HiveSyncConfig newConfig = new HiveSyncConfig(); @@ -137,6 +143,7 @@ public static HiveSyncConfig copy(HiveSyncConfig cfg) { newConfig.partitionFields = cfg.partitionFields; newConfig.partitionValueExtractorClass = cfg.partitionValueExtractorClass; newConfig.jdbcUrl = cfg.jdbcUrl; + newConfig.metastoreUris = cfg.metastoreUris; newConfig.tableName = cfg.tableName; newConfig.bucketSpec = cfg.bucketSpec; newConfig.usePreApacheInputFormat = cfg.usePreApacheInputFormat; @@ -151,6 +158,7 @@ public static HiveSyncConfig copy(HiveSyncConfig cfg) { newConfig.sparkSchemaLengthThreshold = cfg.sparkSchemaLengthThreshold; newConfig.withOperationField = cfg.withOperationField; newConfig.isConditionalSync = cfg.isConditionalSync; + newConfig.sparkVersion = cfg.sparkVersion; return newConfig; } @@ -164,6 +172,7 @@ public String toString() { + ", hiveUser='" + hiveUser + '\'' + ", hivePass='" + hivePass + '\'' + ", jdbcUrl='" + jdbcUrl + '\'' + + ", metastoreUris='" + metastoreUris + '\'' + ", basePath='" + basePath + '\'' + ", partitionFields=" + partitionFields + ", partitionValueExtractorClass='" + partitionValueExtractorClass + '\'' diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java index b37b28ed27636..35200216ee9c0 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java @@ -27,6 +27,7 @@ import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.util.Option; +import 
org.apache.hudi.common.util.StringUtils; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.InvalidTableException; import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils; @@ -302,6 +303,9 @@ private Map getSparkTableProperties(int schemaLengthThreshold, M Map sparkProperties = new HashMap<>(); sparkProperties.put("spark.sql.sources.provider", "hudi"); + if (!StringUtils.isNullOrEmpty(cfg.sparkVersion)) { + sparkProperties.put("spark.sql.create.version", cfg.sparkVersion); + } // Split the schema string to multi-parts according the schemaLengthThreshold size. String schemaString = Parquet2SparkSchemaUtils.convertToSparkSchemaJson(reOrderedType); int numSchemaPart = (schemaString.length() + schemaLengthThreshold - 1) / schemaLengthThreshold; diff --git a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestUtil.java b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestUtil.java index 4b92b252cb0c8..e66bb7c914645 100644 --- a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestUtil.java +++ b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/testutils/HiveTestUtil.java @@ -28,6 +28,7 @@ import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieDeltaWriteStat; import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.HoodieWriteStat; @@ -428,7 +429,7 @@ private static HoodieLogFile generateLogData(Path parquetFilePath, boolean isLog Map header = new HashMap<>(2); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, dataFile.getCommitTime()); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString()); - HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header); + HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD); logWriter.appendBlock(dataBlock); logWriter.close(); return logWriter.getLogFile(); diff --git a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/AbstractSyncHoodieClient.java b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/AbstractSyncHoodieClient.java index 98b11f2f37cc4..1815491f1867e 100644 --- a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/AbstractSyncHoodieClient.java +++ b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/AbstractSyncHoodieClient.java @@ -18,6 +18,8 @@ package org.apache.hudi.sync.common; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.hudi.common.engine.HoodieLocalEngineContext; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieCommitMetadata; @@ -29,9 +31,6 @@ import org.apache.hudi.common.table.timeline.TimelineUtils; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ValidationUtils; - -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.parquet.schema.MessageType; @@ -149,11 +148,7 @@ public void closeQuietly(ResultSet resultSet, Statement stmt) { */ public MessageType getDataSchema() { try { - if (withOperationField) { - return new TableSchemaResolver(metaClient, true).getTableParquetSchema(); - } else { - 
return new TableSchemaResolver(metaClient).getTableParquetSchema(); - } + return new TableSchemaResolver(metaClient).getTableParquetSchema(); } catch (Exception e) { throw new HoodieSyncException("Failed to read data schema", e); } @@ -162,11 +157,7 @@ public MessageType getDataSchema() { public boolean isDropPartition() { try { Option hoodieCommitMetadata; - if (withOperationField) { - hoodieCommitMetadata = new TableSchemaResolver(metaClient, true).getLatestCommitMetadata(); - } else { - hoodieCommitMetadata = new TableSchemaResolver(metaClient).getLatestCommitMetadata(); - } + hoodieCommitMetadata = new TableSchemaResolver(metaClient).getLatestCommitMetadata(); if (hoodieCommitMetadata.isPresent() && WriteOperationType.DELETE_PARTITION.equals(hoodieCommitMetadata.get().getOperationType())) { diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HDFSParquetImporter.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HDFSParquetImporter.java index 8651e30c044c2..b5d7dc4b107dd 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HDFSParquetImporter.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HDFSParquetImporter.java @@ -25,6 +25,7 @@ import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.engine.HoodieEngineContext; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; @@ -197,7 +198,7 @@ protected JavaRDD> buildHoodieRecordsForImport LOG.warn("Unable to parse date from partition field. Assuming partition as (" + partitionField + ")"); } } - return new HoodieRecord<>(new HoodieKey(rowField.toString(), partitionPath), + return new HoodieAvroRecord<>(new HoodieKey(rowField.toString(), partitionPath), new HoodieJsonPayload(genericRecord.toString())); }); } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieClusteringJob.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieClusteringJob.java index a4ee8089f8316..26639628eab1b 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieClusteringJob.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieClusteringJob.java @@ -23,7 +23,6 @@ import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.TableSchemaResolver; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; @@ -49,7 +48,6 @@ import java.util.ArrayList; import java.util.Date; import java.util.List; -import java.util.stream.Collectors; public class HoodieClusteringJob { @@ -189,11 +187,11 @@ public int cluster(int retry) { } private String getSchemaFromLatestInstant() throws Exception { - TableSchemaResolver schemaUtil = new TableSchemaResolver(metaClient); + TableSchemaResolver schemaResolver = new TableSchemaResolver(metaClient); if (metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().countInstants() == 0) { throw new HoodieException("Cannot run clustering without any completed commits"); } - Schema schema = schemaUtil.getTableAvroSchema(false); + Schema schema = schemaResolver.getTableAvroSchema(false); return schema.toString(); } @@ -216,7 +214,7 @@ 
private int doCluster(JavaSparkContext jsc) throws Exception { } Option commitMetadata = client.cluster(cfg.clusteringInstantTime, true).getCommitMetadata(); - return handleErrors(commitMetadata.get(), cfg.clusteringInstantTime); + return UtilHelpers.handleErrors(commitMetadata.get(), cfg.clusteringInstantTime); } } @@ -271,20 +269,7 @@ private int doScheduleAndCluster(JavaSparkContext jsc) throws Exception { LOG.info("The schedule instant time is " + instantTime.get()); LOG.info("Step 2: Do cluster"); Option metadata = client.cluster(instantTime.get(), true).getCommitMetadata(); - return handleErrors(metadata.get(), instantTime.get()); + return UtilHelpers.handleErrors(metadata.get(), instantTime.get()); } } - - private int handleErrors(HoodieCommitMetadata metadata, String instantTime) { - List writeStats = metadata.getPartitionToWriteStats().entrySet().stream().flatMap(e -> - e.getValue().stream()).collect(Collectors.toList()); - long errorsCount = writeStats.stream().mapToLong(HoodieWriteStat::getTotalWriteErrors).sum(); - if (errorsCount == 0) { - LOG.info(String.format("Table imported into hoodie with %s instant time.", instantTime)); - return 0; - } - - LOG.error(String.format("Import failed with %d errors.", errorsCount)); - return -1; - } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieCompactor.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieCompactor.java index 706d1d9df4b9e..ce2be7d5038dc 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieCompactor.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieCompactor.java @@ -18,13 +18,14 @@ package org.apache.hudi.utilities; +import org.apache.avro.Schema; import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.common.config.TypedProperties; -import org.apache.hudi.common.fs.ConsistencyGuardConfig; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.TableSchemaResolver; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.Option; @@ -35,6 +36,10 @@ import com.beust.jcommander.Parameter; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hudi.exception.HoodieException; + +import org.apache.hudi.table.action.HoodieWriteMetadata; +import org.apache.hudi.table.action.compact.strategy.LogFileSizeBasedCompactionStrategy; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; @@ -43,15 +48,19 @@ import java.io.Serializable; import java.util.ArrayList; import java.util.List; +import java.util.Objects; public class HoodieCompactor { private static final Logger LOG = LogManager.getLogger(HoodieCompactor.class); - private static ConsistencyGuardConfig consistencyGuardConfig = ConsistencyGuardConfig.newBuilder().build(); + public static final String EXECUTE = "execute"; + public static final String SCHEDULE = "schedule"; + public static final String SCHEDULE_AND_EXECUTE = "scheduleandexecute"; private final Config cfg; private transient FileSystem fs; private TypedProperties props; private final JavaSparkContext jsc; + private final HoodieTableMetaClient metaClient; public HoodieCompactor(JavaSparkContext jsc, Config cfg) { this.cfg = cfg; @@ -59,6 +68,7 @@ 
public HoodieCompactor(JavaSparkContext jsc, Config cfg) { this.props = cfg.propsFilePath == null ? UtilHelpers.buildProperties(cfg.configs) : readConfigFromFileSystem(jsc, cfg); + this.metaClient = UtilHelpers.createMetaClient(jsc, cfg.basePath, true); } private TypedProperties readConfigFromFileSystem(JavaSparkContext jsc, Config cfg) { @@ -73,9 +83,9 @@ public static class Config implements Serializable { public String tableName = null; @Parameter(names = {"--instant-time", "-it"}, description = "Compaction Instant time", required = false) public String compactionInstantTime = null; - @Parameter(names = {"--parallelism", "-pl"}, description = "Parallelism for hoodie insert", required = true) - public int parallelism = 1; - @Parameter(names = {"--schema-file", "-sf"}, description = "path for Avro schema file", required = true) + @Parameter(names = {"--parallelism", "-pl"}, description = "Parallelism for hoodie insert", required = false) + public int parallelism = 200; + @Parameter(names = {"--schema-file", "-sf"}, description = "path for Avro schema file", required = false) public String schemaFile = null; @Parameter(names = {"--spark-master", "-ms"}, description = "Spark master", required = false) public String sparkMaster = null; @@ -85,8 +95,12 @@ public static class Config implements Serializable { public int retry = 0; @Parameter(names = {"--schedule", "-sc"}, description = "Schedule compaction", required = false) public Boolean runSchedule = false; + @Parameter(names = {"--mode", "-m"}, description = "Set job mode: Set \"schedule\" means make a compact plan; " + + "Set \"execute\" means execute a compact plan at given instant which means --instant-time is needed here; " + + "Set \"scheduleAndExecute\" means make a compact plan first and execute that plan immediately", required = false) + public String runningMode = null; @Parameter(names = {"--strategy", "-st"}, description = "Strategy Class", required = false) - public String strategyClassName = null; + public String strategyClassName = LogFileSizeBasedCompactionStrategy.class.getName(); @Parameter(names = {"--help", "-h"}, help = true) public Boolean help = false; @@ -96,8 +110,57 @@ public static class Config implements Serializable { @Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file " + "(using the CLI parameter \"--props\") can also be passed command line using this parameter. 
This can be repeated", - splitter = IdentitySplitter.class) + splitter = IdentitySplitter.class) public List configs = new ArrayList<>(); + + @Override + public String toString() { + return "HoodieCompactorConfig {\n" + + " --base-path " + basePath + ", \n" + + " --table-name " + tableName + ", \n" + + " --instant-time " + compactionInstantTime + ", \n" + + " --parallelism " + parallelism + ", \n" + + " --schema-file " + schemaFile + ", \n" + + " --spark-master " + sparkMaster + ", \n" + + " --spark-memory " + sparkMemory + ", \n" + + " --retry " + retry + ", \n" + + " --schedule " + runSchedule + ", \n" + + " --mode " + runningMode + ", \n" + + " --strategy " + strategyClassName + ", \n" + + " --props " + propsFilePath + ", \n" + + " --hoodie-conf " + configs + + "\n}"; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + Config config = (Config) o; + return basePath.equals(config.basePath) + && Objects.equals(tableName, config.tableName) + && Objects.equals(compactionInstantTime, config.compactionInstantTime) + && Objects.equals(parallelism, config.parallelism) + && Objects.equals(schemaFile, config.schemaFile) + && Objects.equals(sparkMaster, config.sparkMaster) + && Objects.equals(sparkMemory, config.sparkMemory) + && Objects.equals(retry, config.retry) + && Objects.equals(runSchedule, config.runSchedule) + && Objects.equals(runningMode, config.runningMode) + && Objects.equals(strategyClassName, config.strategyClassName) + && Objects.equals(propsFilePath, config.propsFilePath) + && Objects.equals(configs, config.configs); + } + + @Override + public int hashCode() { + return Objects.hash(basePath, tableName, compactionInstantTime, schemaFile, + sparkMaster, parallelism, sparkMemory, retry, runSchedule, runningMode, strategyClassName, propsFilePath, configs, help); + } } public static void main(String[] args) { @@ -120,52 +183,115 @@ public static void main(String[] args) { public int compact(int retry) { this.fs = FSUtils.getFs(cfg.basePath, jsc.hadoopConfiguration()); + // need to do validate in case that users call compact() directly without setting cfg.runningMode + validateRunningMode(cfg); + LOG.info(cfg); + int ret = UtilHelpers.retry(retry, () -> { - if (cfg.runSchedule) { - if (null == cfg.strategyClassName) { - throw new IllegalArgumentException("Missing Strategy class name for running compaction"); + switch (cfg.runningMode.toLowerCase()) { + case SCHEDULE: { + LOG.info("Running Mode: [" + SCHEDULE + "]; Do schedule"); + Option instantTime = doSchedule(jsc); + int result = instantTime.isPresent() ? 
0 : -1; + if (result == 0) { + LOG.info("The schedule instant time is " + instantTime.get()); + } + return result; + } + case SCHEDULE_AND_EXECUTE: { + LOG.info("Running Mode: [" + SCHEDULE_AND_EXECUTE + "]"); + return doScheduleAndCompact(jsc); + } + case EXECUTE: { + LOG.info("Running Mode: [" + EXECUTE + "]; Do compaction"); + return doCompact(jsc); + } + default: { + LOG.info("Unsupported running mode [" + cfg.runningMode + "], quit the job directly"); + return -1; } - return doSchedule(jsc); - } else { - return doCompact(jsc); } }, "Compact failed"); return ret; } + private Integer doScheduleAndCompact(JavaSparkContext jsc) throws Exception { + LOG.info("Step 1: Do schedule"); + Option instantTime = doSchedule(jsc); + if (!instantTime.isPresent()) { + LOG.warn("Couldn't do schedule"); + return -1; + } else { + cfg.compactionInstantTime = instantTime.get(); + } + + LOG.info("The schedule instant time is " + instantTime.get()); + LOG.info("Step 2: Do compaction"); + + return doCompact(jsc); + } + + // make sure that cfg.runningMode couldn't be null + private static void validateRunningMode(Config cfg) { + // --mode has a higher priority than --schedule + // If we remove --schedule option in the future we need to change runningMode default value to EXECUTE + if (StringUtils.isNullOrEmpty(cfg.runningMode)) { + cfg.runningMode = cfg.runSchedule ? SCHEDULE : EXECUTE; + } + } + private int doCompact(JavaSparkContext jsc) throws Exception { // Get schema. - String schemaStr = UtilHelpers.parseSchema(fs, cfg.schemaFile); - SparkRDDWriteClient client = - UtilHelpers.createHoodieClient(jsc, cfg.basePath, schemaStr, cfg.parallelism, Option.empty(), props); - // If no compaction instant is provided by --instant-time, find the earliest scheduled compaction - // instant from the active timeline - if (StringUtils.isNullOrEmpty(cfg.compactionInstantTime)) { - HoodieTableMetaClient metaClient = UtilHelpers.createMetaClient(jsc, cfg.basePath, true); - Option firstCompactionInstant = - metaClient.getActiveTimeline().firstInstant( - HoodieTimeline.COMPACTION_ACTION, HoodieInstant.State.REQUESTED); - if (firstCompactionInstant.isPresent()) { - cfg.compactionInstantTime = firstCompactionInstant.get().getTimestamp(); - LOG.info("Found the earliest scheduled compaction instant which will be executed: " - + cfg.compactionInstantTime); - } else { - throw new HoodieCompactionException("There is no scheduled compaction in the table."); + String schemaStr; + if (StringUtils.isNullOrEmpty(cfg.schemaFile)) { + schemaStr = getSchemaFromLatestInstant(); + } else { + schemaStr = UtilHelpers.parseSchema(fs, cfg.schemaFile); + } + LOG.info("Schema --> : " + schemaStr); + + try (SparkRDDWriteClient client = + UtilHelpers.createHoodieClient(jsc, cfg.basePath, schemaStr, cfg.parallelism, Option.empty(), props)) { + // If no compaction instant is provided by --instant-time, find the earliest scheduled compaction + // instant from the active timeline + if (StringUtils.isNullOrEmpty(cfg.compactionInstantTime)) { + HoodieTableMetaClient metaClient = UtilHelpers.createMetaClient(jsc, cfg.basePath, true); + Option firstCompactionInstant = + metaClient.getActiveTimeline().firstInstant( + HoodieTimeline.COMPACTION_ACTION, HoodieInstant.State.REQUESTED); + if (firstCompactionInstant.isPresent()) { + cfg.compactionInstantTime = firstCompactionInstant.get().getTimestamp(); + LOG.info("Found the earliest scheduled compaction instant which will be executed: " + + cfg.compactionInstantTime); + } else { + throw new 
HoodieCompactionException("There is no scheduled compaction in the table."); + } } + HoodieWriteMetadata> compactionMetadata = client.compact(cfg.compactionInstantTime); + return UtilHelpers.handleErrors(compactionMetadata.getCommitMetadata().get(), cfg.compactionInstantTime); } - JavaRDD writeResponse = client.compact(cfg.compactionInstantTime); - return UtilHelpers.handleErrors(jsc, cfg.compactionInstantTime, writeResponse); } - private int doSchedule(JavaSparkContext jsc) throws Exception { - // Get schema. - SparkRDDWriteClient client = - UtilHelpers.createHoodieClient(jsc, cfg.basePath, "", cfg.parallelism, Option.of(cfg.strategyClassName), props); - if (StringUtils.isNullOrEmpty(cfg.compactionInstantTime)) { - throw new IllegalArgumentException("No instant time is provided for scheduling compaction. " - + "Please specify the compaction instant time by using --instant-time."); + private Option doSchedule(JavaSparkContext jsc) { + try (SparkRDDWriteClient client = + UtilHelpers.createHoodieClient(jsc, cfg.basePath, "", cfg.parallelism, Option.of(cfg.strategyClassName), props)) { + + if (StringUtils.isNullOrEmpty(cfg.compactionInstantTime)) { + LOG.warn("No instant time is provided for scheduling compaction."); + return client.scheduleCompaction(Option.empty()); + } + + client.scheduleCompactionAtInstant(cfg.compactionInstantTime, Option.empty()); + return Option.of(cfg.compactionInstantTime); + } + } + + private String getSchemaFromLatestInstant() throws Exception { + TableSchemaResolver schemaUtil = new TableSchemaResolver(metaClient); + if (metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().countInstants() == 0) { + throw new HoodieException("Cannot run compaction without any completed commits"); } - client.scheduleCompactionAtInstant(cfg.compactionInstantTime, Option.empty()); - return 0; + Schema schema = schemaUtil.getTableAvroSchema(false); + return schema.toString(); } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieDataTableUtils.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieDataTableUtils.java new file mode 100644 index 0000000000000..755a203d17933 --- /dev/null +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieDataTableUtils.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.utilities; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.metadata.HoodieTableMetadata; + +import org.apache.hadoop.fs.Path; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +public class HoodieDataTableUtils { + + /** + * @return All hoodie files of the table from the file system. + * @throws IOException upon errors. 
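+   * @param tableMetadata metadata table view used to list all partitions and their files
+   * @param basePath base path of the Hudi table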
+ */ + static List getBaseAndLogFilePathsFromFileSystem(HoodieTableMetadata tableMetadata, String basePath) throws IOException { + List allPartitionPaths = tableMetadata.getAllPartitionPaths() + .stream().map(partitionPath -> + FSUtils.getPartitionPath(basePath, partitionPath).toString()) + .collect(Collectors.toList()); + return tableMetadata.getAllFilesInPartitions(allPartitionPaths).values().stream() + .map(fileStatuses -> + Arrays.stream(fileStatuses).map(fileStatus -> fileStatus.getPath()).collect(Collectors.toList())) + .flatMap(list -> list.stream()) + .collect(Collectors.toList()); + } + +} diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieDataTableValidator.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieDataTableValidator.java new file mode 100644 index 0000000000000..0180fa0af1590 --- /dev/null +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieDataTableValidator.java @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.utilities; + +import org.apache.hudi.async.HoodieAsyncService; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieValidationException; +import org.apache.hudi.metadata.FileSystemBackedTableMetadata; +import org.apache.hudi.metadata.HoodieTableMetadata; +import org.apache.hudi.table.repair.RepairUtils; + +import com.beust.jcommander.JCommander; +import com.beust.jcommander.Parameter; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * A validator with spark-submit to ensure there are no dangling data files in the data table. + * No data files found for commits prior to active timeline. 
+ * No extra data files found for completed commits beyond what is present in the commit metadata. + * + *
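+ * The first check treats a file as dangling when the commit time encoded in its name is strictly older
+ * than the earliest instant on the active timeline; it boils down to the comparison used below:
+ * ```
+ * String instantTime = FSUtils.getCommitTime(path.getName());
+ * boolean dangling = HoodieTimeline.compareTimestamps(instantTime, HoodieTimeline.LESSER_THAN, earliestInstant);
+ * ```
+ * The second check groups files on storage by the instant time in their names and compares them against
+ * the file paths recorded in that instant's commit metadata:
+ * ```
+ * Map<String, List<String>> instantToFilesMap = RepairUtils.tagInstantsOfBaseAndLogFiles(basePath, allDataFilePaths);
+ * // files present on storage but absent from their instant's commit metadata are reported as dangling
+ * ```
+ *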

    + * - Default: this validator validates the data files only once. + *

    + * Example command: + * ``` + * spark-submit \ + * --class org.apache.hudi.utilities.HoodieDataTableValidator \ + * --packages org.apache.spark:spark-avro_2.11:2.4.4 \ + * --master spark://xxxx:7077 \ + * --driver-memory 1g \ + * --executor-memory 1g \ + * $HUDI_DIR/hudi/packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.11-0.11.0-SNAPSHOT.jar \ + * --base-path basePath + * ``` + * + *
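+ * The validator can also be driven programmatically; a minimal sketch using only classes from this
+ * patch ({@code local[2]} and the base path are stand-ins):
+ * ```
+ * SparkConf sparkConf = UtilHelpers.buildSparkConf("Hoodie-Data-Table-Validator", "local[2]");
+ * JavaSparkContext jsc = new JavaSparkContext(sparkConf);
+ * HoodieDataTableValidator.Config cfg = new HoodieDataTableValidator.Config();
+ * cfg.basePath = "file:///tmp/hudi_table";   // stand-in table base path
+ * new HoodieDataTableValidator(jsc, cfg).run();   // cfg.continuous defaults to false: validate once
+ * jsc.stop();
+ * ```
+ *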

    + * You can also set `--continuous` to keep this validator running continuously, + * and use `--min-validate-interval-seconds` to control the validation frequency (default: 10 minutes). + *
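+ * Each continuous round measures its own duration and only sleeps for the remainder of the interval
+ * (variable names below are illustrative; see {@code AsyncDataTableValidateService}):
+ * ```
+ * long toSleepMs = minValidateIntervalSeconds * 1000L - (roundEndMs - roundStartMs);
+ * // e.g. a 600s interval and a 90s round leave 510_000 ms to sleep;
+ * // if toSleepMs <= 0 the next round starts immediately
+ * ```
+ *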

    + * Example command: + * ``` + * spark-submit \ + * --class org.apache.hudi.utilities.HoodieDataTableValidator \ + * --packages org.apache.spark:spark-avro_2.11:2.4.4 \ + * --master spark://xxxx:7077 \ + * --driver-memory 1g \ + * --executor-memory 1g \ + * $HUDI_DIR/hudi/packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.11-0.11.0-SNAPSHOT.jar \ + * --base-path basePath + * --continuous \ + * --min-validate-interval-seconds 60 + * ``` + */ +public class HoodieDataTableValidator implements Serializable { + + private static final Logger LOG = LogManager.getLogger(HoodieDataTableValidator.class); + + // Spark context + private transient JavaSparkContext jsc; + // config + private Config cfg; + // Properties with source, hoodie client, key generator etc. + private TypedProperties props; + + private HoodieTableMetaClient metaClient; + + protected transient Option asyncDataTableValidateService; + + public HoodieDataTableValidator(HoodieTableMetaClient metaClient) { + this.metaClient = metaClient; + } + + public HoodieDataTableValidator(JavaSparkContext jsc, Config cfg) { + this.jsc = jsc; + this.cfg = cfg; + + this.props = cfg.propsFilePath == null + ? UtilHelpers.buildProperties(cfg.configs) + : readConfigFromFileSystem(jsc, cfg); + + this.metaClient = HoodieTableMetaClient.builder() + .setConf(jsc.hadoopConfiguration()).setBasePath(cfg.basePath) + .setLoadActiveTimelineOnLoad(true) + .build(); + + this.asyncDataTableValidateService = cfg.continuous ? Option.of(new AsyncDataTableValidateService()) : Option.empty(); + } + + /** + * Reads config from the file system. + * + * @param jsc {@link JavaSparkContext} instance. + * @param cfg {@link Config} instance. + * @return the {@link TypedProperties} instance. + */ + private TypedProperties readConfigFromFileSystem(JavaSparkContext jsc, Config cfg) { + return UtilHelpers.readConfig(jsc.hadoopConfiguration(), new Path(cfg.propsFilePath), cfg.configs) + .getProps(true); + } + + public static class Config implements Serializable { + @Parameter(names = {"--base-path", "-sp"}, description = "Base path for the table", required = true) + public String basePath = null; + + @Parameter(names = {"--continuous"}, description = "Running MetadataTableValidator in continuous. " + + "Can use --min-validate-interval-seconds to control validation frequency", required = false) + public boolean continuous = false; + + @Parameter(names = {"--min-validate-interval-seconds"}, + description = "the min validate interval of each validate when set --continuous, default is 10 minutes.") + public Integer minValidateIntervalSeconds = 10 * 60; + + @Parameter(names = {"--parallelism", "-pl"}, description = "Parallelism for validation", required = false) + public int parallelism = 200; + + @Parameter(names = {"--ignore-failed", "-ig"}, description = "Ignore data table validate failure and continue.", required = false) + public boolean ignoreFailed = false; + + @Parameter(names = {"--assume-date-partitioning"}, description = "Should HoodieWriteClient assume the data is partitioned by dates, i.e three levels from base path." + + "This is a stop-gap to support tables created by versions < 0.3.1. 
Will be removed eventually", required = false) + public Boolean assumeDatePartitioning = false; + + @Parameter(names = {"--spark-master", "-ms"}, description = "Spark master", required = false) + public String sparkMaster = null; + + @Parameter(names = {"--spark-memory", "-sm"}, description = "spark memory to use", required = false) + public String sparkMemory = "1g"; + + @Parameter(names = {"--props"}, description = "path to properties file on localfs or dfs, with configurations for " + + "hoodie client") + public String propsFilePath = null; + + @Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file " + + "(using the CLI parameter \"--props\") can also be passed command line using this parameter. This can be repeated", + splitter = IdentitySplitter.class) + public List configs = new ArrayList<>(); + + @Parameter(names = {"--help", "-h"}, help = true) + public Boolean help = false; + + @Override + public String toString() { + return "MetadataTableValidatorConfig {\n" + + " --base-path " + basePath + ", \n" + + " --continuous " + continuous + ", \n" + + " --ignore-failed " + ignoreFailed + ", \n" + + " --min-validate-interval-seconds " + minValidateIntervalSeconds + ", \n" + + " --parallelism " + parallelism + ", \n" + + " --spark-master " + sparkMaster + ", \n" + + " --spark-memory " + sparkMemory + ", \n" + + " --assumeDatePartitioning-memory " + assumeDatePartitioning + ", \n" + + " --props " + propsFilePath + ", \n" + + " --hoodie-conf " + configs + + "\n}"; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + HoodieMetadataTableValidator.Config config = (HoodieMetadataTableValidator.Config) o; + return basePath.equals(config.basePath) + && Objects.equals(continuous, config.continuous) + && Objects.equals(minValidateIntervalSeconds, config.minValidateIntervalSeconds) + && Objects.equals(parallelism, config.parallelism) + && Objects.equals(ignoreFailed, config.ignoreFailed) + && Objects.equals(sparkMaster, config.sparkMaster) + && Objects.equals(sparkMemory, config.sparkMemory) + && Objects.equals(assumeDatePartitioning, config.assumeDatePartitioning) + && Objects.equals(propsFilePath, config.propsFilePath) + && Objects.equals(configs, config.configs); + } + + @Override + public int hashCode() { + return Objects.hash(basePath, continuous, minValidateIntervalSeconds, parallelism, ignoreFailed, sparkMaster, sparkMemory, + assumeDatePartitioning, propsFilePath, configs, help); + } + } + + public static void main(String[] args) { + final Config cfg = new Config(); + JCommander cmd = new JCommander(cfg, null, args); + + if (cfg.help || args.length == 0) { + cmd.usage(); + System.exit(1); + } + + SparkConf sparkConf = UtilHelpers.buildSparkConf("Hoodie-Data-Table-Validator", cfg.sparkMaster); + sparkConf.set("spark.executor.memory", cfg.sparkMemory); + JavaSparkContext jsc = new JavaSparkContext(sparkConf); + + HoodieDataTableValidator validator = new HoodieDataTableValidator(jsc, cfg); + + try { + validator.run(); + } catch (Throwable throwable) { + LOG.error("Fail to do hoodie Data table validation for " + validator.cfg, throwable); + } finally { + jsc.stop(); + } + } + + public void run() { + try { + LOG.info(cfg); + if (cfg.continuous) { + LOG.info(" ****** do hoodie data table validation in CONTINUOUS mode ******"); + doHoodieDataTableValidationContinuous(); + } else { + LOG.info(" ****** do hoodie data table validation once 
******"); + doHoodieDataTableValidationOnce(); + } + } catch (Exception e) { + throw new HoodieException("Unable to do hoodie data table validation in " + cfg.basePath, e); + } finally { + + if (asyncDataTableValidateService.isPresent()) { + asyncDataTableValidateService.get().shutdown(true); + } + } + } + + private void doHoodieDataTableValidationOnce() { + try { + doDataTableValidation(); + } catch (HoodieValidationException e) { + LOG.error("Metadata table validation failed to HoodieValidationException", e); + if (!cfg.ignoreFailed) { + throw e; + } + } + } + + private void doHoodieDataTableValidationContinuous() { + asyncDataTableValidateService.ifPresent(service -> { + service.start(null); + try { + service.waitForShutdown(); + } catch (Exception e) { + throw new HoodieException(e.getMessage(), e); + } + }); + } + + public void doDataTableValidation() { + boolean finalResult = true; + metaClient.reloadActiveTimeline(); + String basePath = metaClient.getBasePath(); + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + try { + HoodieTableMetadata tableMetadata = new FileSystemBackedTableMetadata( + engineContext, engineContext.getHadoopConf(), cfg.basePath, cfg.assumeDatePartitioning); + List allDataFilePaths = HoodieDataTableUtils.getBaseAndLogFilePathsFromFileSystem(tableMetadata, cfg.basePath); + // verify that no data files present with commit time < earliest commit in active timeline. + if (metaClient.getActiveTimeline().firstInstant().isPresent()) { + String earliestInstant = metaClient.getActiveTimeline().firstInstant().get().getTimestamp(); + List danglingFilePaths = allDataFilePaths.stream().filter(path -> { + String instantTime = FSUtils.getCommitTime(path.getName()); + return HoodieTimeline.compareTimestamps(instantTime, HoodieTimeline.LESSER_THAN, earliestInstant); + }).collect(Collectors.toList()); + + if (!danglingFilePaths.isEmpty() && danglingFilePaths.size() > 0) { + LOG.error("Data table validation failed due to dangling files count " + danglingFilePaths.size() + ", found before active timeline"); + danglingFilePaths.forEach(entry -> LOG.error("Dangling file: " + entry.toString())); + finalResult = false; + if (!cfg.ignoreFailed) { + throw new HoodieValidationException("Data table validation failed due to dangling files " + danglingFilePaths.size()); + } + } + + // Verify that for every completed commit in active timeline, there are no extra files found apart from what is present in + // commit metadata. + Map> instantToFilesMap = RepairUtils.tagInstantsOfBaseAndLogFiles( + metaClient.getBasePath(), allDataFilePaths); + HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline(); + List hoodieInstants = activeTimeline.filterCompletedInstants().getInstants().collect(Collectors.toList()); + + List danglingFiles = engineContext.flatMap(hoodieInstants, instant -> { + Option> filesFromTimeline = RepairUtils.getBaseAndLogFilePathsFromTimeline( + activeTimeline, instant); + List baseAndLogFilesFromFs = instantToFilesMap.containsKey(instant.getTimestamp()) ? 
instantToFilesMap.get(instant.getTimestamp()) + : Collections.emptyList(); + if (!baseAndLogFilesFromFs.isEmpty()) { + Set danglingInstantFiles = new HashSet<>(baseAndLogFilesFromFs); + if (filesFromTimeline.isPresent()) { + danglingInstantFiles.removeAll(filesFromTimeline.get()); + } + return new ArrayList<>(danglingInstantFiles).stream(); + } else { + return Stream.empty(); + } + }, hoodieInstants.size()).stream().collect(Collectors.toList()); + + if (!danglingFiles.isEmpty()) { + LOG.error("Data table validation failed due to extra files found for completed commits " + danglingFiles.size()); + danglingFiles.forEach(entry -> LOG.error("Dangling file: " + entry.toString())); + finalResult = false; + if (!cfg.ignoreFailed) { + throw new HoodieValidationException("Data table validation failed due to dangling files " + danglingFiles.size()); + } + } + } + } catch (Exception e) { + LOG.error("Data table validation failed due to " + e.getMessage(), e); + if (!cfg.ignoreFailed) { + throw new HoodieValidationException("Data table validation failed due to " + e.getMessage(), e); + } + } + + if (finalResult) { + LOG.info("Data table validation succeeded."); + } else { + LOG.warn("Data table validation failed."); + } + } + + public class AsyncDataTableValidateService extends HoodieAsyncService { + private final transient ExecutorService executor = Executors.newSingleThreadExecutor(); + + @Override + protected Pair startService() { + return Pair.of(CompletableFuture.supplyAsync(() -> { + while (true) { + try { + long start = System.currentTimeMillis(); + doDataTableValidation(); + long toSleepMs = cfg.minValidateIntervalSeconds * 1000 - (System.currentTimeMillis() - start); + + if (toSleepMs > 0) { + LOG.info("Last validate ran less than min validate interval: " + cfg.minValidateIntervalSeconds + " s, sleep: " + + toSleepMs + " ms."); + Thread.sleep(toSleepMs); + } + } catch (HoodieValidationException e) { + LOG.error("Shutting down AsyncDataTableValidateService due to HoodieValidationException", e); + if (!cfg.ignoreFailed) { + throw e; + } + } catch (InterruptedException e) { + // ignore InterruptedException here. + } + } + }, executor), executor); + } + } +} \ No newline at end of file diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java new file mode 100644 index 0000000000000..f9b0e1a86d6af --- /dev/null +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java @@ -0,0 +1,757 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.utilities; + +import org.apache.hudi.async.HoodieAsyncService; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.BaseFile; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.model.HoodieColumnRangeMetadata; +import org.apache.hudi.common.model.HoodieFileGroup; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.TableSchemaResolver; +import org.apache.hudi.common.table.view.FileSystemViewManager; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ParquetUtils; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieValidationException; +import org.apache.hudi.io.storage.HoodieFileReader; +import org.apache.hudi.io.storage.HoodieFileReaderFactory; +import org.apache.hudi.metadata.HoodieTableMetadata; +import org.apache.hudi.utilities.util.BloomFilterData; + +import com.beust.jcommander.JCommander; +import com.beust.jcommander.Parameter; +import jline.internal.Log; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; + +import java.io.IOException; +import java.io.Serializable; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Objects; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.stream.Collectors; + +/** + * A validator with spark-submit to compare information, such as partitions, file listing, index, etc., + * between metadata table and filesystem. + *

    + * There are five validation tasks that can be enabled independently through the following CLI options: + * - `--validate-latest-file-slices`: validate latest file slices for all partitions. + * - `--validate-latest-base-files`: validate latest base files for all partitions. + * - `--validate-all-file-groups`: validate all file groups, and all file slices within file groups. + * - `--validate-all-column-stats`: validate column stats for all columns in the schema. + * - `--validate-bloom-filters`: validate bloom filters of base files. + *
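+ * The five checks are plain boolean fields on {@code Config}, so they can also be toggled in code; a
+ * sketch enabling only the two file-listing checks (assumes an existing {@code JavaSparkContext jsc};
+ * the base path is a stand-in):
+ * ```
+ * HoodieMetadataTableValidator.Config cfg = new HoodieMetadataTableValidator.Config();
+ * cfg.basePath = "file:///tmp/hudi_table";   // stand-in table base path
+ * cfg.validateLatestFileSlices = true;
+ * cfg.validateLatestBaseFiles = true;
+ * new HoodieMetadataTableValidator(jsc, cfg).run();
+ * ```
+ *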

    + * - Default: this validator compares the results between the metadata table and the filesystem only once. + *
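+ * Every enabled check reduces to comparing two sorted lists, one built from the metadata table and one
+ * from a direct file-system listing; any size or element mismatch fails the validation (see
+ * {@code validate(...)} below):
+ * ```
+ * if (fromMetadataTable.size() != fromFileSystem.size() || !fromMetadataTable.equals(fromFileSystem)) {
+ *   throw new HoodieValidationException(message);
+ * }
+ * ```
+ *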

    + * Example command: + * ``` + * spark-submit \ + * --class org.apache.hudi.utilities.HoodieMetadataTableValidator \ + * --packages org.apache.spark:spark-avro_2.11:2.4.4 \ + * --master spark://xxxx:7077 \ + * --driver-memory 1g \ + * --executor-memory 1g \ + * $HUDI_DIR/hudi/packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.11-0.11.0-SNAPSHOT.jar \ + * --base-path basePath \ + * --validate-latest-file-slices \ + * --validate-latest-base-files \ + * --validate-all-file-groups + * ``` + * + *

    + * You can also set `--continuous` to keep this validator running continuously, + * and use `--min-validate-interval-seconds` to control the validation frequency (default: 10 minutes). + *
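+ * With `--ignore-failed` a failing round is only logged and the loop keeps going; otherwise the first
+ * {@link HoodieValidationException} shuts the async service down:
+ * ```
+ * try {
+ *   doMetadataTableValidation();
+ * } catch (HoodieValidationException e) {
+ *   if (!cfg.ignoreFailed) {
+ *     throw e;   // stops AsyncMetadataTableValidateService
+ *   }
+ * }
+ * ```
+ *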

    + * Example command: + * ``` + * spark-submit \ + * --class org.apache.hudi.utilities.HoodieMetadataTableValidator \ + * --packages org.apache.spark:spark-avro_2.11:2.4.4 \ + * --master spark://xxxx:7077 \ + * --driver-memory 1g \ + * --executor-memory 1g \ + * $HUDI_DIR/hudi/packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.11-0.11.0-SNAPSHOT.jar \ + * --base-path basePath \ + * --validate-latest-file-slices \ + * --validate-latest-base-files \ + * --validate-all-file-groups \ + * --continuous \ + * --min-validate-interval-seconds 60 + * ``` + * + */ +public class HoodieMetadataTableValidator implements Serializable { + + private static final Logger LOG = LogManager.getLogger(HoodieMetadataTableValidator.class); + + // Spark context + private transient JavaSparkContext jsc; + // config + private Config cfg; + // Properties with source, hoodie client, key generator etc. + private TypedProperties props; + + private HoodieTableMetaClient metaClient; + + protected transient Option asyncMetadataTableValidateService; + + public HoodieMetadataTableValidator(HoodieTableMetaClient metaClient) { + this.metaClient = metaClient; + } + + public HoodieMetadataTableValidator(JavaSparkContext jsc, Config cfg) { + this.jsc = jsc; + this.cfg = cfg; + + this.props = cfg.propsFilePath == null + ? UtilHelpers.buildProperties(cfg.configs) + : readConfigFromFileSystem(jsc, cfg); + + this.metaClient = HoodieTableMetaClient.builder() + .setConf(jsc.hadoopConfiguration()).setBasePath(cfg.basePath) + .setLoadActiveTimelineOnLoad(true) + .build(); + + this.asyncMetadataTableValidateService = cfg.continuous ? Option.of(new AsyncMetadataTableValidateService()) : Option.empty(); + } + + /** + * Reads config from the file system. + * + * @param jsc {@link JavaSparkContext} instance. + * @param cfg {@link Config} instance. + * @return the {@link TypedProperties} instance. + */ + private TypedProperties readConfigFromFileSystem(JavaSparkContext jsc, Config cfg) { + return UtilHelpers.readConfig(jsc.hadoopConfiguration(), new Path(cfg.propsFilePath), cfg.configs) + .getProps(true); + } + + public static class Config implements Serializable { + @Parameter(names = {"--base-path", "-sp"}, description = "Base path for the table", required = true) + public String basePath = null; + + @Parameter(names = {"--continuous"}, description = "Running MetadataTableValidator in continuous. 
" + + "Can use --min-validate-interval-seconds to control validation frequency", required = false) + public boolean continuous = false; + + @Parameter(names = {"--validate-latest-file-slices"}, description = "Validate latest file slices for all partitions.", required = false) + public boolean validateLatestFileSlices = false; + + @Parameter(names = {"--validate-latest-base-files"}, description = "Validate latest base files for all partitions.", required = false) + public boolean validateLatestBaseFiles = false; + + @Parameter(names = {"--validate-all-file-groups"}, description = "Validate all file groups, and all file slices within file groups.", required = false) + public boolean validateAllFileGroups = false; + + @Parameter(names = {"--validate-all-column-stats"}, description = "Validate column stats for all columns in the schema", required = false) + public boolean validateAllColumnStats = false; + + @Parameter(names = {"--validate-bloom-filters"}, description = "Validate bloom filters of base files", required = false) + public boolean validateBloomFilters = false; + + @Parameter(names = {"--min-validate-interval-seconds"}, + description = "the min validate interval of each validate when set --continuous, default is 10 minutes.") + public Integer minValidateIntervalSeconds = 10 * 60; + + @Parameter(names = {"--parallelism", "-pl"}, description = "Parallelism for valuation", required = false) + public int parallelism = 200; + + @Parameter(names = {"--ignore-failed", "-ig"}, description = "Ignore metadata validate failure and continue.", required = false) + public boolean ignoreFailed = false; + + @Parameter(names = {"--spark-master", "-ms"}, description = "Spark master", required = false) + public String sparkMaster = null; + + @Parameter(names = {"--spark-memory", "-sm"}, description = "spark memory to use", required = false) + public String sparkMemory = "1g"; + + @Parameter(names = {"--assume-date-partitioning"}, description = "Should HoodieWriteClient assume the data is partitioned by dates, i.e three levels from base path." + + "This is a stop-gap to support tables created by versions < 0.3.1. Will be removed eventually", required = false) + public Boolean assumeDatePartitioning = false; + + @Parameter(names = {"--props"}, description = "path to properties file on localfs or dfs, with configurations for " + + "hoodie client") + public String propsFilePath = null; + + @Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file " + + "(using the CLI parameter \"--props\") can also be passed command line using this parameter. 
This can be repeated", + splitter = IdentitySplitter.class) + public List configs = new ArrayList<>(); + + @Parameter(names = {"--help", "-h"}, help = true) + public Boolean help = false; + + @Override + public String toString() { + return "MetadataTableValidatorConfig {\n" + + " --base-path " + basePath + ", \n" + + " --validate-latest-file-slices " + validateLatestFileSlices + ", \n" + + " --validate-latest-base-files " + validateLatestBaseFiles + ", \n" + + " --validate-all-file-groups " + validateAllFileGroups + ", \n" + + " --validate-all-column-stats " + validateAllColumnStats + ", \n" + + " --validate-bloom-filters " + validateBloomFilters + ", \n" + + " --continuous " + continuous + ", \n" + + " --ignore-failed " + ignoreFailed + ", \n" + + " --min-validate-interval-seconds " + minValidateIntervalSeconds + ", \n" + + " --parallelism " + parallelism + ", \n" + + " --spark-master " + sparkMaster + ", \n" + + " --spark-memory " + sparkMemory + ", \n" + + " --assumeDatePartitioning-memory " + assumeDatePartitioning + ", \n" + + " --props " + propsFilePath + ", \n" + + " --hoodie-conf " + configs + + "\n}"; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + Config config = (Config) o; + return basePath.equals(config.basePath) + && Objects.equals(continuous, config.continuous) + && Objects.equals(validateLatestFileSlices, config.validateLatestFileSlices) + && Objects.equals(validateLatestBaseFiles, config.validateLatestBaseFiles) + && Objects.equals(validateAllFileGroups, config.validateAllFileGroups) + && Objects.equals(validateAllColumnStats, config.validateAllColumnStats) + && Objects.equals(validateBloomFilters, config.validateBloomFilters) + && Objects.equals(minValidateIntervalSeconds, config.minValidateIntervalSeconds) + && Objects.equals(parallelism, config.parallelism) + && Objects.equals(ignoreFailed, config.ignoreFailed) + && Objects.equals(sparkMaster, config.sparkMaster) + && Objects.equals(sparkMemory, config.sparkMemory) + && Objects.equals(assumeDatePartitioning, config.assumeDatePartitioning) + && Objects.equals(propsFilePath, config.propsFilePath) + && Objects.equals(configs, config.configs); + } + + @Override + public int hashCode() { + return Objects.hash(basePath, continuous, validateLatestFileSlices, validateLatestBaseFiles, + validateAllFileGroups, validateAllColumnStats, validateBloomFilters, minValidateIntervalSeconds, + parallelism, ignoreFailed, sparkMaster, sparkMemory, assumeDatePartitioning, propsFilePath, configs, help); + } + } + + public static void main(String[] args) { + final Config cfg = new Config(); + JCommander cmd = new JCommander(cfg, null, args); + + if (cfg.help || args.length == 0) { + cmd.usage(); + System.exit(1); + } + + SparkConf sparkConf = UtilHelpers.buildSparkConf("Hoodie-Metadata-Table-Validator", cfg.sparkMaster); + sparkConf.set("spark.executor.memory", cfg.sparkMemory); + JavaSparkContext jsc = new JavaSparkContext(sparkConf); + + HoodieMetadataTableValidator validator = new HoodieMetadataTableValidator(jsc, cfg); + + try { + validator.run(); + } catch (Throwable throwable) { + LOG.error("Fail to do hoodie metadata table validation for " + validator.cfg, throwable); + } finally { + jsc.stop(); + } + } + + public void run() { + try { + LOG.info(cfg); + if (cfg.continuous) { + LOG.info(" ****** do hoodie metadata table validation in CONTINUOUS mode ******"); + doHoodieMetadataTableValidationContinuous(); + } else { + 
LOG.info(" ****** do hoodie metadata table validation once ******"); + doHoodieMetadataTableValidationOnce(); + } + } catch (Exception e) { + throw new HoodieException("Unable to do hoodie metadata table validation in " + cfg.basePath, e); + } finally { + + if (asyncMetadataTableValidateService.isPresent()) { + asyncMetadataTableValidateService.get().shutdown(true); + } + } + } + + private void doHoodieMetadataTableValidationOnce() { + try { + doMetadataTableValidation(); + } catch (HoodieValidationException e) { + LOG.error("Metadata table validation failed to HoodieValidationException", e); + if (!cfg.ignoreFailed) { + throw e; + } + } + } + + private void doHoodieMetadataTableValidationContinuous() { + asyncMetadataTableValidateService.ifPresent(service -> { + service.start(null); + try { + service.waitForShutdown(); + } catch (Exception e) { + throw new HoodieException(e.getMessage(), e); + } + }); + } + + public void doMetadataTableValidation() { + boolean finalResult = true; + metaClient.reloadActiveTimeline(); + String basePath = metaClient.getBasePath(); + HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc); + List allPartitions = validatePartitions(engineContext, basePath); + HoodieMetadataValidationContext metadataTableBasedContext = + new HoodieMetadataValidationContext(engineContext, cfg, metaClient, true); + HoodieMetadataValidationContext fsBasedContext = + new HoodieMetadataValidationContext(engineContext, cfg, metaClient, false); + + List result = engineContext.parallelize(allPartitions, allPartitions.size()).map(partitionPath -> { + try { + validateFilesInPartition(metadataTableBasedContext, fsBasedContext, partitionPath); + LOG.info("Metadata table validation succeeded for " + partitionPath); + return true; + } catch (HoodieValidationException e) { + LOG.error("Metadata table validation failed for " + partitionPath + " due to HoodieValidationException", e); + if (!cfg.ignoreFailed) { + throw e; + } + return false; + } + }).collectAsList(); + + for (Boolean res : result) { + finalResult &= res; + } + + if (finalResult) { + LOG.info("Metadata table validation succeeded."); + } else { + LOG.warn("Metadata table validation failed."); + } + } + + /** + * Compare the listing partitions result between metadata table and fileSystem. + */ + private List validatePartitions(HoodieSparkEngineContext engineContext, String basePath) { + // compare partitions + List allPartitionPathsFromFS = FSUtils.getAllPartitionPaths(engineContext, basePath, false, cfg.assumeDatePartitioning); + List allPartitionPathsMeta = FSUtils.getAllPartitionPaths(engineContext, basePath, true, cfg.assumeDatePartitioning); + + Collections.sort(allPartitionPathsFromFS); + Collections.sort(allPartitionPathsMeta); + + if (allPartitionPathsFromFS.size() != allPartitionPathsMeta.size() + || !allPartitionPathsFromFS.equals(allPartitionPathsMeta)) { + String message = "Compare Partitions Failed! " + "AllPartitionPathsFromFS : " + allPartitionPathsFromFS + " and allPartitionPathsMeta : " + allPartitionPathsMeta; + LOG.error(message); + throw new HoodieValidationException(message); + } + + return allPartitionPathsMeta; + } + + /** + * Compare the file listing and index data between metadata table and fileSystem. + * For now, validate five kinds of apis: + * 1. HoodieMetadataFileSystemView::getLatestFileSlices + * 2. HoodieMetadataFileSystemView::getLatestBaseFiles + * 3. HoodieMetadataFileSystemView::getAllFileGroups and HoodieMetadataFileSystemView::getAllFileSlices + * 4. 
HoodieBackedTableMetadata::getColumnStats + * 5. HoodieBackedTableMetadata::getBloomFilters + * + * @param metadataTableBasedContext Validation context containing information based on metadata table + * @param fsBasedContext Validation context containing information based on the file system + * @param partitionPath Partition path String + */ + private void validateFilesInPartition( + HoodieMetadataValidationContext metadataTableBasedContext, + HoodieMetadataValidationContext fsBasedContext, String partitionPath) { + if (cfg.validateLatestFileSlices) { + validateLatestFileSlices(metadataTableBasedContext, fsBasedContext, partitionPath); + } + + if (cfg.validateLatestBaseFiles) { + validateLatestBaseFiles(metadataTableBasedContext, fsBasedContext, partitionPath); + } + + if (cfg.validateAllFileGroups) { + validateAllFileGroups(metadataTableBasedContext, fsBasedContext, partitionPath); + } + + if (cfg.validateAllColumnStats) { + validateAllColumnStats(metadataTableBasedContext, fsBasedContext, partitionPath); + } + + if (cfg.validateBloomFilters) { + validateBloomFilters(metadataTableBasedContext, fsBasedContext, partitionPath); + } + } + + private void validateAllFileGroups( + HoodieMetadataValidationContext metadataTableBasedContext, + HoodieMetadataValidationContext fsBasedContext, String partitionPath) { + List allFileSlicesFromMeta = metadataTableBasedContext + .getSortedAllFileGroupList(partitionPath).stream() + .flatMap(HoodieFileGroup::getAllFileSlices).sorted(new FileSliceComparator()) + .collect(Collectors.toList()); + List allFileSlicesFromFS = fsBasedContext + .getSortedAllFileGroupList(partitionPath).stream() + .flatMap(HoodieFileGroup::getAllFileSlices).sorted(new FileSliceComparator()) + .collect(Collectors.toList()); + + LOG.debug("All file slices from metadata: " + allFileSlicesFromMeta + ". For partitions " + partitionPath); + LOG.debug("All file slices from direct listing: " + allFileSlicesFromFS + ". For partitions " + partitionPath); + validate(allFileSlicesFromMeta, allFileSlicesFromFS, partitionPath, "file slices"); + + LOG.info("Validation of all file groups succeeded for partition " + partitionPath); + } + + /** + * Compare getLatestBaseFiles between metadata table and fileSystem. + */ + private void validateLatestBaseFiles( + HoodieMetadataValidationContext metadataTableBasedContext, + HoodieMetadataValidationContext fsBasedContext, String partitionPath) { + + List latestFilesFromMetadata = metadataTableBasedContext.getSortedLatestBaseFileList(partitionPath); + List latestFilesFromFS = fsBasedContext.getSortedLatestBaseFileList(partitionPath); + + LOG.debug("Latest base file from metadata: " + latestFilesFromMetadata + ". For partitions " + partitionPath); + LOG.debug("Latest base file from direct listing: " + latestFilesFromFS + ". For partitions " + partitionPath); + if (latestFilesFromMetadata.size() != latestFilesFromFS.size() + || !latestFilesFromMetadata.equals(latestFilesFromFS)) { + String message = "Validation of metadata get latest base file for partition " + partitionPath + " failed. " + + "Latest base file from metadata: " + latestFilesFromMetadata + + "Latest base file from direct listing: " + latestFilesFromFS; + LOG.error(message); + throw new HoodieValidationException(message); + } else { + LOG.info("Validation of getLatestBaseFiles succeeded for partition " + partitionPath); + } + } + + /** + * Compare getLatestFileSlices between metadata table and fileSystem. 
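+ * Both sides are sorted with {@link FileSliceComparator}, so the comparison is order-insensitive.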
+ */ + private void validateLatestFileSlices( + HoodieMetadataValidationContext metadataTableBasedContext, + HoodieMetadataValidationContext fsBasedContext, String partitionPath) { + + List latestFileSlicesFromMetadataTable = metadataTableBasedContext.getSortedLatestFileSliceList(partitionPath); + List latestFileSlicesFromFS = fsBasedContext.getSortedLatestFileSliceList(partitionPath); + + LOG.debug("Latest file list from metadata: " + latestFileSlicesFromMetadataTable + ". For partition " + partitionPath); + LOG.debug("Latest file list from direct listing: " + latestFileSlicesFromFS + ". For partition " + partitionPath); + + validate(latestFileSlicesFromMetadataTable, latestFileSlicesFromFS, partitionPath, "file slices"); + LOG.info("Validation of getLatestFileSlices succeeded for partition " + partitionPath); + } + + private void validateAllColumnStats( + HoodieMetadataValidationContext metadataTableBasedContext, + HoodieMetadataValidationContext fsBasedContext, String partitionPath) { + List latestBaseFilenameList = fsBasedContext.getSortedLatestBaseFileList(partitionPath) + .stream().map(BaseFile::getFileName).collect(Collectors.toList()); + List> metadataBasedColStats = metadataTableBasedContext + .getSortedColumnStatsList(partitionPath, latestBaseFilenameList); + List> fsBasedColStats = fsBasedContext + .getSortedColumnStatsList(partitionPath, latestBaseFilenameList); + + validate(metadataBasedColStats, fsBasedColStats, partitionPath, "column stats"); + + LOG.info("Validation of column stats succeeded for partition " + partitionPath); + } + + private void validateBloomFilters( + HoodieMetadataValidationContext metadataTableBasedContext, + HoodieMetadataValidationContext fsBasedContext, String partitionPath) { + List latestBaseFilenameList = fsBasedContext.getSortedLatestBaseFileList(partitionPath) + .stream().map(BaseFile::getFileName).collect(Collectors.toList()); + List metadataBasedBloomFilters = metadataTableBasedContext + .getSortedBloomFilterList(partitionPath, latestBaseFilenameList); + List fsBasedBloomFilters = fsBasedContext + .getSortedBloomFilterList(partitionPath, latestBaseFilenameList); + + validate(metadataBasedBloomFilters, fsBasedBloomFilters, partitionPath, "bloom filters"); + + LOG.info("Validation of bloom filters succeeded for partition " + partitionPath); + } + + private void validate( + List infoListFromMetadataTable, List infoListFromFS, String partitionPath, String label) { + if (infoListFromMetadataTable.size() != infoListFromFS.size() + || !infoListFromMetadataTable.equals(infoListFromFS)) { + String message = String.format("Validation of %s for partition %s failed." 
+ + "\n%s from metadata: %s\n%s from file system and base files: %s", + label, partitionPath, label, infoListFromMetadataTable, label, infoListFromFS); + LOG.error(message); + throw new HoodieValidationException(message); + } else { + LOG.info(String.format("Validation of %s succeeded for partition %s", label, partitionPath)); + } + } + + public class AsyncMetadataTableValidateService extends HoodieAsyncService { + private final transient ExecutorService executor = Executors.newSingleThreadExecutor(); + + @Override + protected Pair startService() { + return Pair.of(CompletableFuture.supplyAsync(() -> { + while (true) { + try { + long start = System.currentTimeMillis(); + doMetadataTableValidation(); + long toSleepMs = cfg.minValidateIntervalSeconds * 1000 - (System.currentTimeMillis() - start); + + if (toSleepMs > 0) { + LOG.info("Last validate ran less than min validate interval: " + cfg.minValidateIntervalSeconds + " s, sleep: " + + toSleepMs + " ms."); + Thread.sleep(toSleepMs); + } + } catch (HoodieValidationException e) { + LOG.error("Shutting down AsyncMetadataTableValidateService due to HoodieValidationException", e); + if (!cfg.ignoreFailed) { + throw e; + } + } catch (InterruptedException e) { + // ignore InterruptedException here. + } + } + }, executor), executor); + } + } + + public static class FileSliceComparator implements Comparator, Serializable { + + @Override + public int compare(FileSlice o1, FileSlice o2) { + return (o1.getPartitionPath() + o1.getFileId() + o1.getBaseInstantTime()) + .compareTo(o2.getPartitionPath() + o2.getFileId() + o2.getBaseInstantTime()); + } + } + + public static class HoodieBaseFileComparator implements Comparator, Serializable { + + @Override + public int compare(HoodieBaseFile o1, HoodieBaseFile o2) { + return o1.getPath().compareTo(o2.getPath()); + } + } + + public static class HoodieFileGroupComparator implements Comparator, Serializable { + + @Override + public int compare(HoodieFileGroup o1, HoodieFileGroup o2) { + return o1.getFileGroupId().compareTo(o2.getFileGroupId()); + } + } + + public static class HoodieColumnRangeMetadataComparator + implements Comparator>, Serializable { + + @Override + public int compare(HoodieColumnRangeMetadata o1, HoodieColumnRangeMetadata o2) { + return o1.toString().compareTo(o2.toString()); + } + } + + /** + * Class for storing relevant information for metadata table validation. + *

    + * If metadata table is disabled, the APIs provide the information, e.g., file listing, + * index, from the file system and base files. If metadata table is enabled, the APIs + * provide the information from the metadata table. The same API is expected to return + * the same information regardless of whether metadata table is enabled, which is + * verified in the {@link HoodieMetadataTableValidator}. + */ + private static class HoodieMetadataValidationContext implements Serializable { + private HoodieTableMetaClient metaClient; + private HoodieTableFileSystemView fileSystemView; + private HoodieTableMetadata tableMetadata; + private boolean enableMetadataTable; + private List allColumnNameList; + + public HoodieMetadataValidationContext( + HoodieEngineContext engineContext, Config cfg, HoodieTableMetaClient metaClient, + boolean enableMetadataTable) { + this.metaClient = metaClient; + this.enableMetadataTable = enableMetadataTable; + HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder() + .enable(enableMetadataTable) + .withMetadataIndexBloomFilter(enableMetadataTable) + .withMetadataIndexColumnStats(enableMetadataTable) + .withMetadataIndexForAllColumns(enableMetadataTable) + .withAssumeDatePartitioning(cfg.assumeDatePartitioning) + .build(); + this.fileSystemView = FileSystemViewManager.createInMemoryFileSystemView(engineContext, + metaClient, metadataConfig); + this.tableMetadata = HoodieTableMetadata.create(engineContext, metadataConfig, metaClient.getBasePath(), + FileSystemViewStorageConfig.SPILLABLE_DIR.defaultValue()); + if (metaClient.getCommitsTimeline().filterCompletedInstants().countInstants() > 0) { + this.allColumnNameList = getAllColumnNames(); + } + } + + public List getSortedLatestBaseFileList(String partitionPath) { + return fileSystemView.getLatestBaseFiles(partitionPath) + .sorted(new HoodieBaseFileComparator()).collect(Collectors.toList()); + } + + public List getSortedLatestFileSliceList(String partitionPath) { + return fileSystemView.getLatestFileSlices(partitionPath) + .sorted(new FileSliceComparator()).collect(Collectors.toList()); + } + + public List getSortedAllFileGroupList(String partitionPath) { + return fileSystemView.getAllFileGroups(partitionPath) + .sorted(new HoodieFileGroupComparator()).collect(Collectors.toList()); + } + + public List> getSortedColumnStatsList( + String partitionPath, List baseFileNameList) { + LOG.info("All column names for getting column stats: " + allColumnNameList); + if (enableMetadataTable) { + List> partitionFileNameList = baseFileNameList.stream() + .map(filename -> Pair.of(partitionPath, filename)).collect(Collectors.toList()); + return allColumnNameList.stream() + .flatMap(columnName -> + tableMetadata.getColumnStats(partitionFileNameList, columnName).values().stream() + .map(stats -> new HoodieColumnRangeMetadata<>( + stats.getFileName(), + columnName, + stats.getMinValue(), + stats.getMaxValue(), + stats.getNullCount(), + stats.getValueCount(), + stats.getTotalSize(), + stats.getTotalUncompressedSize())) + .collect(Collectors.toList()) + .stream()) + .sorted(new HoodieColumnRangeMetadataComparator()) + .collect(Collectors.toList()); + } else { + return baseFileNameList.stream().flatMap(filename -> + new ParquetUtils().readRangeFromParquetMetadata( + metaClient.getHadoopConf(), + new Path(new Path(metaClient.getBasePath(), partitionPath), filename), + allColumnNameList).stream()) + .map(rangeMetadata -> new HoodieColumnRangeMetadata( + rangeMetadata.getFilePath(), + rangeMetadata.getColumnName(), + // 
Note: here we ignore the type in the validation, + // since column stats from metadata table store the min/max values as String + rangeMetadata.getMinValue().toString(), + rangeMetadata.getMaxValue().toString(), + rangeMetadata.getNullCount(), + rangeMetadata.getValueCount(), + rangeMetadata.getTotalSize(), + rangeMetadata.getTotalUncompressedSize() + )) + .sorted(new HoodieColumnRangeMetadataComparator()) + .collect(Collectors.toList()); + } + } + + public List getSortedBloomFilterList( + String partitionPath, List baseFileNameList) { + if (enableMetadataTable) { + List> partitionFileNameList = baseFileNameList.stream() + .map(filename -> Pair.of(partitionPath, filename)).collect(Collectors.toList()); + return tableMetadata.getBloomFilters(partitionFileNameList).entrySet().stream() + .map(entry -> BloomFilterData.builder() + .setPartitionPath(entry.getKey().getKey()) + .setFilename(entry.getKey().getValue()) + .setBloomFilter(entry.getValue()) + .build()) + .sorted() + .collect(Collectors.toList()); + } else { + return baseFileNameList.stream() + .map(filename -> readBloomFilterFromFile(partitionPath, filename)) + .filter(Option::isPresent) + .map(Option::get) + .sorted() + .collect(Collectors.toList()); + } + } + + private List getAllColumnNames() { + TableSchemaResolver schemaResolver = new TableSchemaResolver(metaClient); + try { + return schemaResolver.getTableAvroSchema().getFields().stream() + .map(entry -> entry.name()).collect(Collectors.toList()); + } catch (Exception e) { + throw new HoodieException("Failed to get all column names for " + metaClient.getBasePath()); + } + } + + private Option readBloomFilterFromFile(String partitionPath, String filename) { + Path path = new Path(new Path(metaClient.getBasePath(), partitionPath), filename); + HoodieFileReader fileReader; + try { + fileReader = HoodieFileReaderFactory.getFileReader(metaClient.getHadoopConf(), path); + } catch (IOException e) { + Log.error("Failed to get file reader for " + path + " " + e.getMessage()); + return Option.empty(); + } + final BloomFilter fileBloomFilter = fileReader.readBloomFilter(); + if (fileBloomFilter == null) { + Log.error("Failed to read bloom filter for " + path); + return Option.empty(); + } + return Option.of(BloomFilterData.builder() + .setPartitionPath(partitionPath) + .setFilename(filename) + .setBloomFilter(ByteBuffer.wrap(fileBloomFilter.serializeToString().getBytes())) + .build()); + } + } +} diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieRepairTool.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieRepairTool.java index d6b74c8099dfc..7d725ed6af37a 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieRepairTool.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieRepairTool.java @@ -48,7 +48,6 @@ import java.io.Serializable; import java.security.SecureRandom; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.stream.Collectors; @@ -345,7 +344,7 @@ static boolean deleteFiles( boolean doRepair( Option startingInstantOption, Option endingInstantOption, boolean isDryRun) throws IOException { // Scans all partitions to find base and log files in the base path - List allFilesInPartitions = getBaseAndLogFilePathsFromFileSystem(); + List allFilesInPartitions = HoodieDataTableUtils.getBaseAndLogFilePathsFromFileSystem(tableMetadata, cfg.basePath); // Buckets the files based on instant time // instant time -> relative paths of base and log files to 
base path Map> instantToFilesMap = RepairUtils.tagInstantsOfBaseAndLogFiles( @@ -388,22 +387,6 @@ boolean doRepair( return true; } - /** - * @return All hoodie files of the table from the file system. - * @throws IOException upon errors. - */ - List getBaseAndLogFilePathsFromFileSystem() throws IOException { - List allPartitionPaths = tableMetadata.getAllPartitionPaths() - .stream().map(partitionPath -> - FSUtils.getPartitionPath(cfg.basePath, partitionPath).toString()) - .collect(Collectors.toList()); - return tableMetadata.getAllFilesInPartitions(allPartitionPaths).values().stream() - .map(fileStatuses -> - Arrays.stream(fileStatuses).map(fileStatus -> fileStatus.getPath()).collect(Collectors.toList())) - .flatMap(list -> list.stream()) - .collect(Collectors.toList()); - } - /** * Undoes repair for UNDO mode. * diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java index 81c5caf82142f..8690ff1cfb132 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java @@ -26,7 +26,9 @@ import org.apache.hudi.common.config.DFSPropertiesConfiguration; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.TableSchemaResolver; import org.apache.hudi.common.util.Functions.Function1; @@ -220,7 +222,7 @@ public static String parseSchema(FileSystem fs, String schemaFile) throws Except return new String(buf.array()); } - private static SparkConf buildSparkConf(String appName, String defaultMaster) { + public static SparkConf buildSparkConf(String appName, String defaultMaster) { return buildSparkConf(appName, defaultMaster, new HashMap<>()); } @@ -303,6 +305,18 @@ public static int handleErrors(JavaSparkContext jsc, String instantTime, JavaRDD return -1; } + public static int handleErrors(HoodieCommitMetadata metadata, String instantTime) { + List writeStats = metadata.getWriteStats(); + long errorsCount = writeStats.stream().mapToLong(HoodieWriteStat::getTotalWriteErrors).sum(); + if (errorsCount == 0) { + LOG.info(String.format("Finish job with %s instant time.", instantTime)); + return 0; + } + + LOG.error(String.format("Job failed with %d errors.", errorsCount)); + return -1; + } + /** * Returns a factory for creating connections to the given JDBC URL. * diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/BootstrapExecutor.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/BootstrapExecutor.java index 682c2daa1f68e..833fce295e326 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/BootstrapExecutor.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/BootstrapExecutor.java @@ -160,10 +160,12 @@ public void execute() throws IOException { * Sync to Hive. */ private void syncHive() { - if (cfg.enableHiveSync) { + if (cfg.enableHiveSync || cfg.enableMetaSync) { HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, cfg.targetBasePath, cfg.baseFileFormat); - LOG.info("Syncing target hoodie table with hive table(" + hiveSyncConfig.tableName + "). 
Hive metastore URL :" - + hiveSyncConfig.jdbcUrl + ", basePath :" + cfg.targetBasePath); + HiveConf hiveConf = new HiveConf(fs.getConf(), HiveConf.class); + hiveConf.set(HiveConf.ConfVars.METASTOREURIS.varname,hiveSyncConfig.metastoreUris); + LOG.info("Hive Conf => " + hiveConf.getAllProperties().toString()); + LOG.info("Hive Sync Conf => " + hiveSyncConfig); new HiveSyncTool(hiveSyncConfig, new HiveConf(configuration, HiveConf.class), fs).syncHoodieTable(); } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java index c7b29c9f0f520..082a9b1d5e82d 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java @@ -29,6 +29,7 @@ import org.apache.hudi.client.embedded.EmbeddedTimelineService; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; @@ -57,6 +58,7 @@ import org.apache.hudi.keygen.SimpleKeyGenerator; import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory; +import org.apache.hudi.metrics.HoodieMetrics; import org.apache.hudi.sync.common.AbstractSyncTool; import org.apache.hudi.utilities.UtilHelpers; import org.apache.hudi.utilities.callback.kafka.HoodieWriteCommitKafkaCallback; @@ -65,6 +67,7 @@ import org.apache.hudi.utilities.callback.pulsar.HoodieWriteCommitPulsarCallbackConfig; import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.Config; import org.apache.hudi.utilities.exception.HoodieDeltaStreamerException; +import org.apache.hudi.utilities.exception.HoodieSourceTimeoutException; import org.apache.hudi.utilities.schema.DelegatingSchemaProvider; import org.apache.hudi.utilities.schema.SchemaProvider; import org.apache.hudi.utilities.schema.SchemaSet; @@ -190,6 +193,9 @@ public class DeltaSync implements Serializable { */ private transient Option commitTimelineOpt; + // all commits timeline + private transient Option allCommitsTimelineOpt; + /** * Tracks whether new schema is being seen and creates client accordingly. 
*/ @@ -208,6 +214,8 @@ public class DeltaSync implements Serializable { private transient HoodieDeltaStreamerMetrics metrics; + private transient HoodieMetrics hoodieMetrics; + public DeltaSync(HoodieDeltaStreamer.Config cfg, SparkSession sparkSession, SchemaProvider schemaProvider, TypedProperties props, JavaSparkContext jssc, FileSystem fs, Configuration conf, Function onInitializingHoodieWriteClient) throws IOException { @@ -228,6 +236,7 @@ public DeltaSync(HoodieDeltaStreamer.Config cfg, SparkSession sparkSession, Sche this.transformer = UtilHelpers.createTransformer(cfg.transformerClassNames); this.metrics = new HoodieDeltaStreamerMetrics(getHoodieClientConfig(this.schemaProvider)); + this.hoodieMetrics = new HoodieMetrics(getHoodieClientConfig(this.schemaProvider)); this.formatAdapter = new SourceFormatAdapter( UtilHelpers.createSource(cfg.sourceClassName, props, jssc, sparkSession, schemaProvider, metrics)); @@ -245,15 +254,18 @@ public void refreshTimeline() throws IOException { switch (meta.getTableType()) { case COPY_ON_WRITE: this.commitTimelineOpt = Option.of(meta.getActiveTimeline().getCommitTimeline().filterCompletedInstants()); + this.allCommitsTimelineOpt = Option.of(meta.getActiveTimeline().getAllCommitsTimeline()); break; case MERGE_ON_READ: this.commitTimelineOpt = Option.of(meta.getActiveTimeline().getDeltaCommitTimeline().filterCompletedInstants()); + this.allCommitsTimelineOpt = Option.of(meta.getActiveTimeline().getAllCommitsTimeline()); break; default: throw new HoodieException("Unsupported table type :" + meta.getTableType()); } } else { this.commitTimelineOpt = Option.empty(); + this.allCommitsTimelineOpt = Option.empty(); String partitionColumns = HoodieSparkUtils.getPartitionColumns(keyGenerator, props); HoodieTableMetaClient.withPropertyBuilder() .setTableType(cfg.tableType) @@ -306,6 +318,14 @@ public Pair, JavaRDD> syncOnce() throws IOException } } + // complete the pending clustering before writing to sink + if (cfg.retryLastPendingInlineClusteringJob && getHoodieClientConfig(this.schemaProvider).inlineClusteringEnabled()) { + Option pendingClusteringInstant = getLastPendingClusteringInstant(allCommitsTimelineOpt); + if (pendingClusteringInstant.isPresent()) { + writeClient.cluster(pendingClusteringInstant.get(), true); + } + } + result = writeToSink(srcRecordsWithCkpt.getRight().getRight(), srcRecordsWithCkpt.getRight().getLeft(), metrics, overallTimerContext); } @@ -317,6 +337,14 @@ public Pair, JavaRDD> syncOnce() throws IOException return result; } + private Option getLastPendingClusteringInstant(Option commitTimelineOpt) { + if (commitTimelineOpt.isPresent()) { + Option pendingClusteringInstant = commitTimelineOpt.get().filterPendingReplaceTimeline().lastInstant(); + return pendingClusteringInstant.isPresent() ? Option.of(pendingClusteringInstant.get().getTimestamp()) : Option.empty(); + } + return Option.empty(); + } + /** * Read from Upstream Source and apply transformation if needed. * @@ -348,11 +376,35 @@ public Pair>> readFromSource( .initTable(new Configuration(jssc.hadoopConfiguration()), cfg.targetBasePath); } + LOG.debug("Checkpoint from config: " + cfg.checkpoint); if (!resumeCheckpointStr.isPresent() && cfg.checkpoint != null) { resumeCheckpointStr = Option.of(cfg.checkpoint); } LOG.info("Checkpoint to resume from : " + resumeCheckpointStr); + int maxRetryCount = cfg.retryOnSourceFailures ? 
cfg.maxRetryCount : 1; + int curRetryCount = 0; + Pair>> sourceDataToSync = null; + while (curRetryCount++ < maxRetryCount && sourceDataToSync == null) { + try { + sourceDataToSync = fetchFromSource(resumeCheckpointStr); + } catch (HoodieSourceTimeoutException e) { + if (curRetryCount >= maxRetryCount) { + throw e; + } + try { + LOG.error("Exception thrown while fetching data from source. Msg : " + e.getMessage() + ", class : " + e.getClass() + ", cause : " + e.getCause()); + LOG.error("Sleeping for " + (cfg.retryIntervalSecs) + " before retrying again. Current retry count " + curRetryCount + ", max retry count " + cfg.maxRetryCount); + Thread.sleep(cfg.retryIntervalSecs * 1000); + } catch (InterruptedException ex) { + LOG.error("Ignoring InterruptedException while waiting to retry on source failure " + e.getMessage()); + } + } + } + return sourceDataToSync; + } + + private Pair>> fetchFromSource(Option resumeCheckpointStr) { final Option> avroRDDOptional; final String checkpointStr; SchemaProvider schemaProvider; @@ -391,7 +443,7 @@ public Pair>> readFromSource( targetSchemaProvider = UtilHelpers.createRowBasedSchemaProvider(r.schema(), props, jssc); } return (SchemaProvider) new DelegatingSchemaProvider(props, jssc, - dataAndCheckpoint.getSchemaProvider(), targetSchemaProvider); }) + dataAndCheckpoint.getSchemaProvider(), targetSchemaProvider); }) .orElse(dataAndCheckpoint.getSchemaProvider()); avroRDDOptional = transformed .map(t -> HoodieSparkUtils.createRdd( @@ -408,9 +460,11 @@ public Pair>> readFromSource( schemaProvider = dataAndCheckpoint.getSchemaProvider(); } - if (Objects.equals(checkpointStr, resumeCheckpointStr.orElse(null))) { + if (!cfg.allowCommitOnNoCheckpointChange && Objects.equals(checkpointStr, resumeCheckpointStr.orElse(null))) { LOG.info("No new data, source checkpoint has not changed. Nothing to commit. Old checkpoint=(" - + resumeCheckpointStr + "). New Checkpoint=(" + checkpointStr + ")"); + + resumeCheckpointStr + "). 
New Checkpoint=(" + checkpointStr + ")"); + String commitActionType = CommitUtils.getCommitActionType(cfg.operation, HoodieTableType.valueOf(cfg.tableType)); + hoodieMetrics.updateMetricsForEmptyData(commitActionType); return null; } @@ -428,7 +482,7 @@ public Pair>> readFromSource( KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(), Boolean.parseBoolean(KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.defaultValue())))) : DataSourceUtils.createPayload(cfg.payloadClassName, gr); - return new HoodieRecord<>(keyGenerator.getKey(gr), payload); + return new HoodieAvroRecord<>(keyGenerator.getKey(gr), payload); }); return Pair.of(schemaProvider, Pair.of(checkpointStr, records)); @@ -448,6 +502,7 @@ private Option getCheckpointToResume(Option commitTimeli Option commitMetadataOption = getLatestCommitMetadataWithValidCheckpointInfo(commitTimelineOpt.get()); if (commitMetadataOption.isPresent()) { HoodieCommitMetadata commitMetadata = commitMetadataOption.get(); + LOG.debug("Checkpoint reset from metadata: " + commitMetadata.getMetadata(CHECKPOINT_RESET_KEY)); if (cfg.checkpoint != null && (StringUtils.isNullOrEmpty(commitMetadata.getMetadata(CHECKPOINT_RESET_KEY)) || !cfg.checkpoint.equals(commitMetadata.getMetadata(CHECKPOINT_RESET_KEY)))) { resumeCheckpointStr = Option.of(cfg.checkpoint); @@ -530,6 +585,10 @@ private Pair, JavaRDD> writeToSink(JavaRDD partitions = records.map(record -> record.getPartitionPath()).distinct().collect(); + writeStatusRDD = writeClient.deletePartitions(partitions, instantTime).getWriteStatuses(); + break; default: throw new HoodieDeltaStreamerException("Unknown operation : " + cfg.operation); } @@ -541,7 +600,9 @@ private Pair, JavaRDD> writeToSink(JavaRDD checkpointCommitMetadata = new HashMap<>(); - checkpointCommitMetadata.put(CHECKPOINT_KEY, checkpointStr); + if (checkpointStr != null) { + checkpointCommitMetadata.put(CHECKPOINT_KEY, checkpointStr); + } if (cfg.checkpoint != null) { checkpointCommitMetadata.put(CHECKPOINT_RESET_KEY, cfg.checkpoint); } @@ -655,9 +716,10 @@ private void syncMeta(HoodieDeltaStreamerMetrics metrics) { public void syncHive() { HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, cfg.targetBasePath, cfg.baseFileFormat); - LOG.info("Syncing target hoodie table with hive table(" + hiveSyncConfig.tableName + "). 
Hive metastore URL :" - + hiveSyncConfig.jdbcUrl + ", basePath :" + cfg.targetBasePath); HiveConf hiveConf = new HiveConf(conf, HiveConf.class); + if (StringUtils.isNullOrEmpty(hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname))) { + hiveConf.set(HiveConf.ConfVars.METASTOREURIS.varname, hiveSyncConfig.metastoreUris); + } LOG.info("Hive Conf => " + hiveConf.getAllProperties().toString()); LOG.info("Hive Sync Conf => " + hiveSyncConfig.toString()); new HiveSyncTool(hiveSyncConfig, hiveConf, fs).syncHoodieTable(); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java index 3ceb0028751a2..c0c141db11de3 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java @@ -369,9 +369,25 @@ public static class Config implements Serializable { @Parameter(names = {"--bootstrap-index-class"}, description = "subclass of BootstrapIndex") public String bootstrapIndexClass = HFileBootstrapIndex.class.getName(); + @Parameter(names = {"--retry-on-source-failures"}, description = "Retry on any source failures") + public Boolean retryOnSourceFailures = false; + + @Parameter(names = {"--retry-interval-seconds"}, description = "the retry interval for source failures if --retry-on-source-failures is enabled") + public Integer retryIntervalSecs = 30; + + @Parameter(names = {"--max-retry-count"}, description = "the max retry count if --retry-on-source-failures is enabled") + public Integer maxRetryCount = 3; + + @Parameter(names = {"--allow-commit-on-no-checkpoint-change"}, description = "allow commits even if checkpoint has not changed before and after fetch data" + + "from souce. This might be useful in sources like SqlSource where there is not checkpoint. And is not recommended to enable in continuous mode.") + public Boolean allowCommitOnNoCheckpointChange = false; + @Parameter(names = {"--help", "-h"}, help = true) public Boolean help = false; + @Parameter(names = {"--retry-last-pending-inline-clustering", "-rc"}, description = "Retry last pending inline clustering plan before writing to sink.") + public Boolean retryLastPendingInlineClusteringJob = false; + public boolean isAsyncCompactionEnabled() { return continuousMode && !forceDisableCompaction && HoodieTableType.MERGE_ON_READ.equals(HoodieTableType.valueOf(tableType)); @@ -643,6 +659,10 @@ protected Pair startService() { asyncCompactService.get().enqueuePendingAsyncServiceInstant(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, scheduledCompactionInstantAndRDD.get().getLeft().get())); asyncCompactService.get().waitTillPendingAsyncServiceInstantsReducesTo(cfg.maxPendingCompactions); + if (asyncCompactService.get().hasError()) { + error = true; + throw new HoodieException("Async compaction failed. 
Shutting down Delta Sync..."); + } } if (clusteringConfig.isAsyncClusteringEnabled()) { Option clusteringInstant = deltaSync.getClusteringInstantOpt(); @@ -650,6 +670,10 @@ protected Pair startService() { LOG.info("Scheduled async clustering for instant: " + clusteringInstant.get()); asyncClusteringService.get().enqueuePendingAsyncServiceInstant(new HoodieInstant(State.REQUESTED, HoodieTimeline.REPLACE_COMMIT_ACTION, clusteringInstant.get())); asyncClusteringService.get().waitTillPendingAsyncServiceInstantsReducesTo(cfg.maxPendingClustering); + if (asyncClusteringService.get().hasError()) { + error = true; + throw new HoodieException("Async clustering failed. Shutting down Delta Sync..."); + } } } long toSleepMs = cfg.minSyncIntervalSeconds * 1000 - (System.currentTimeMillis() - start); @@ -668,6 +692,7 @@ protected Pair startService() { } } finally { shutdownAsyncServices(error); + executor.shutdownNow(); } return true; }, executor), executor); @@ -721,13 +746,12 @@ protected Boolean onInitializingWriteClient(SparkRDDWriteClient writeClient) { HoodieTableMetaClient.builder().setConf(new Configuration(jssc.hadoopConfiguration())).setBasePath(cfg.targetBasePath).setLoadActiveTimelineOnLoad(true).build(); List pending = CompactionUtils.getPendingCompactionInstantTimes(meta); pending.forEach(hoodieInstant -> asyncCompactService.get().enqueuePendingAsyncServiceInstant(hoodieInstant)); - asyncCompactService.get().start((error) -> { - // Shutdown DeltaSync - shutdown(false); - return true; - }); + asyncCompactService.get().start(error -> true); try { asyncCompactService.get().waitTillPendingAsyncServiceInstantsReducesTo(cfg.maxPendingCompactions); + if (asyncCompactService.get().hasError()) { + throw new HoodieException("Async compaction failed during write client initialization."); + } } catch (InterruptedException ie) { throw new HoodieException(ie); } @@ -746,12 +770,12 @@ protected Boolean onInitializingWriteClient(SparkRDDWriteClient writeClient) { List pending = ClusteringUtils.getPendingClusteringInstantTimes(meta); LOG.info(String.format("Found %d pending clustering instants ", pending.size())); pending.forEach(hoodieInstant -> asyncClusteringService.get().enqueuePendingAsyncServiceInstant(hoodieInstant)); - asyncClusteringService.get().start((error) -> { - shutdown(false); - return true; - }); + asyncClusteringService.get().start(error -> true); try { asyncClusteringService.get().waitTillPendingAsyncServiceInstantsReducesTo(cfg.maxPendingClustering); + if (asyncClusteringService.get().hasError()) { + throw new HoodieException("Async clustering failed during write client initialization."); + } } catch (InterruptedException e) { throw new HoodieException(e); } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieMultiTableDeltaStreamer.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieMultiTableDeltaStreamer.java index dc150803e8b38..bcd7b3b7d8ac6 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieMultiTableDeltaStreamer.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieMultiTableDeltaStreamer.java @@ -51,6 +51,9 @@ import java.util.Objects; import java.util.Set; +import static org.apache.hudi.utilities.schema.SchemaRegistryProvider.Config.SRC_SCHEMA_REGISTRY_URL_PROP; +import static org.apache.hudi.utilities.schema.SchemaRegistryProvider.Config.TARGET_SCHEMA_REGISTRY_URL_PROP; + /** * Wrapper over HoodieDeltaStreamer.java class. 
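(Editor's note: a worked example of the registry-URL derivation that populateSchemaProviderProps performs further below; the property keys are from this patch, the values are hypothetical:)

    // Inputs:
    //   hoodie.deltastreamer.schemaprovider.registry.baseUrl   = http://registry:8081/subjects/
    //   hoodie.deltastreamer.source.kafka.topic                = stock_ticks
    //   hoodie.deltastreamer.schemaprovider.registry.urlSuffix = -value/versions/latest
    // Derived, but only when registry.url / registry.targetUrl are not already set explicitly:
    //   hoodie.deltastreamer.schemaprovider.registry.url       =
    //       http://registry:8081/subjects/stock_ticks-value/versions/latest
    //   hoodie.deltastreamer.schemaprovider.registry.targetUrl = the same value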
* Helps with ingesting incremental data into hoodie datasets for multiple tables. @@ -152,19 +155,38 @@ private List getTablesToBeIngested(TypedProperties properties) { private void populateSchemaProviderProps(HoodieDeltaStreamer.Config cfg, TypedProperties typedProperties) { if (Objects.equals(cfg.schemaProviderClassName, SchemaRegistryProvider.class.getName())) { + populateSourceRegistryProp(typedProperties); + populateTargetRegistryProp(typedProperties); + } + } + + private void populateTargetRegistryProp(TypedProperties typedProperties) { + String schemaRegistryTargetUrl = typedProperties.getString(TARGET_SCHEMA_REGISTRY_URL_PROP, null); + if (StringUtils.isNullOrEmpty(schemaRegistryTargetUrl)) { String schemaRegistryBaseUrl = typedProperties.getString(Constants.SCHEMA_REGISTRY_BASE_URL_PROP); String schemaRegistrySuffix = typedProperties.getString(Constants.SCHEMA_REGISTRY_URL_SUFFIX_PROP, null); - String sourceSchemaRegistrySuffix; String targetSchemaRegistrySuffix; if (StringUtils.isNullOrEmpty(schemaRegistrySuffix)) { - sourceSchemaRegistrySuffix = typedProperties.getString(Constants.SCHEMA_REGISTRY_SOURCE_URL_SUFFIX); targetSchemaRegistrySuffix = typedProperties.getString(Constants.SCHEMA_REGISTRY_TARGET_URL_SUFFIX); } else { targetSchemaRegistrySuffix = schemaRegistrySuffix; + } + typedProperties.setProperty(TARGET_SCHEMA_REGISTRY_URL_PROP, schemaRegistryBaseUrl + typedProperties.getString(Constants.KAFKA_TOPIC_PROP) + targetSchemaRegistrySuffix); + } + } + + private void populateSourceRegistryProp(TypedProperties typedProperties) { + String schemaRegistrySourceUrl = typedProperties.getString(SRC_SCHEMA_REGISTRY_URL_PROP, null); + if (StringUtils.isNullOrEmpty(schemaRegistrySourceUrl)) { + String schemaRegistryBaseUrl = typedProperties.getString(Constants.SCHEMA_REGISTRY_BASE_URL_PROP); + String schemaRegistrySuffix = typedProperties.getString(Constants.SCHEMA_REGISTRY_URL_SUFFIX_PROP, null); + String sourceSchemaRegistrySuffix; + if (StringUtils.isNullOrEmpty(schemaRegistrySuffix)) { + sourceSchemaRegistrySuffix = typedProperties.getString(Constants.SCHEMA_REGISTRY_SOURCE_URL_SUFFIX); + } else { sourceSchemaRegistrySuffix = schemaRegistrySuffix; } - typedProperties.setProperty(Constants.SOURCE_SCHEMA_REGISTRY_URL_PROP, schemaRegistryBaseUrl + typedProperties.getString(Constants.KAFKA_TOPIC_PROP) + sourceSchemaRegistrySuffix); - typedProperties.setProperty(Constants.TARGET_SCHEMA_REGISTRY_URL_PROP, schemaRegistryBaseUrl + typedProperties.getString(Constants.KAFKA_TOPIC_PROP) + targetSchemaRegistrySuffix); + typedProperties.setProperty(SRC_SCHEMA_REGISTRY_URL_PROP, schemaRegistryBaseUrl + typedProperties.getString(Constants.KAFKA_TOPIC_PROP) + sourceSchemaRegistrySuffix); } } @@ -397,8 +419,6 @@ public void sync() { public static class Constants { public static final String KAFKA_TOPIC_PROP = "hoodie.deltastreamer.source.kafka.topic"; - private static final String SOURCE_SCHEMA_REGISTRY_URL_PROP = "hoodie.deltastreamer.schemaprovider.registry.url"; - private static final String TARGET_SCHEMA_REGISTRY_URL_PROP = "hoodie.deltastreamer.schemaprovider.registry.targetUrl"; public static final String HIVE_SYNC_TABLE_PROP = "hoodie.datasource.hive_sync.table"; private static final String SCHEMA_REGISTRY_BASE_URL_PROP = "hoodie.deltastreamer.schemaprovider.registry.baseUrl"; private static final String SCHEMA_REGISTRY_URL_SUFFIX_PROP = "hoodie.deltastreamer.schemaprovider.registry.urlSuffix"; diff --git 
a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/SchedulerConfGenerator.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/SchedulerConfGenerator.java index f5f1f384765ab..b991f9d46cb0b 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/SchedulerConfGenerator.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/SchedulerConfGenerator.java @@ -18,6 +18,7 @@ package org.apache.hudi.utilities.deltastreamer; +import org.apache.hudi.SparkConfigs; import org.apache.hudi.async.AsyncCompactService; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.util.Option; @@ -46,7 +47,6 @@ public class SchedulerConfGenerator { public static final String COMPACT_POOL_NAME = AsyncCompactService.COMPACT_POOL_NAME; public static final String SPARK_SCHEDULER_MODE_KEY = "spark.scheduler.mode"; public static final String SPARK_SCHEDULER_FAIR_MODE = "FAIR"; - public static final String SPARK_SCHEDULER_ALLOCATION_FILE_KEY = "spark.scheduler.allocation.file"; private static final String SPARK_SCHEDULING_PATTERN = "\n\n \n" @@ -85,7 +85,7 @@ public static Map getSparkSchedulingConfigs(HoodieDeltaStreamer. && cfg.continuousMode && cfg.tableType.equals(HoodieTableType.MERGE_ON_READ.name())) { String sparkSchedulingConfFile = generateAndStoreConfig(cfg.deltaSyncSchedulingWeight, cfg.compactSchedulingWeight, cfg.deltaSyncSchedulingMinShare, cfg.compactSchedulingMinShare); - additionalSparkConfigs.put(SPARK_SCHEDULER_ALLOCATION_FILE_KEY, sparkSchedulingConfFile); + additionalSparkConfigs.put(SparkConfigs.SPARK_SCHEDULER_ALLOCATION_FILE_KEY(), sparkSchedulingConfFile); } else { LOG.warn("Job Scheduling Configs will not be in effect as spark.scheduler.mode " + "is not set to FAIR at instantiation time. Continuing without scheduling configs"); diff --git a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/vector/RowColumnVector.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/exception/HoodieSchemaProviderException.java similarity index 66% rename from hudi-flink/src/main/java/org/apache/hudi/table/format/cow/vector/RowColumnVector.java rename to hudi-utilities/src/main/java/org/apache/hudi/utilities/exception/HoodieSchemaProviderException.java index 293af7b9cf2eb..26b6f53b052ae 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/table/format/cow/vector/RowColumnVector.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/exception/HoodieSchemaProviderException.java @@ -7,7 +7,7 @@ * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -16,15 +16,17 @@ * limitations under the License. */ -package org.apache.hudi.table.format.cow.vector; +package org.apache.hudi.utilities.exception; -import org.apache.hudi.table.format.cow.data.ColumnarRowData; +import org.apache.hudi.exception.HoodieException; -import org.apache.flink.table.data.vector.ColumnVector; +public class HoodieSchemaProviderException extends HoodieException { -/** - * Row column vector. 
- */ -public interface RowColumnVector extends ColumnVector { - ColumnarRowData getRow(int i); -} \ No newline at end of file + public HoodieSchemaProviderException(String msg, Throwable e) { + super(msg, e); + } + + public HoodieSchemaProviderException(String msg) { + super(msg); + } +} diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/exception/HoodieSourceTimeoutException.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/exception/HoodieSourceTimeoutException.java new file mode 100644 index 0000000000000..d95f4f4b5e808 --- /dev/null +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/exception/HoodieSourceTimeoutException.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.utilities.exception; + +import org.apache.hudi.exception.HoodieException; + +public class HoodieSourceTimeoutException extends HoodieException { + + public HoodieSourceTimeoutException(String msg, Throwable e) { + super(msg, e); + } + + public HoodieSourceTimeoutException(String msg) { + super(msg); + } +} \ No newline at end of file diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/perf/TimelineServerPerf.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/perf/TimelineServerPerf.java index ac15897f5785c..d992976da2b5a 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/perf/TimelineServerPerf.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/perf/TimelineServerPerf.java @@ -74,7 +74,7 @@ public class TimelineServerPerf implements Serializable { public TimelineServerPerf(Config cfg) throws IOException { this.cfg = cfg; useExternalTimelineServer = (cfg.serverHost != null); - TimelineService.Config timelineServiceConf = cfg.getTimelinServerConfig(); + TimelineService.Config timelineServiceConf = cfg.getTimelineServerConfig(); this.timelineServer = new TimelineService( new HoodieLocalEngineContext(FSUtils.prepareHadoopConf(new Configuration())), new Configuration(), timelineServiceConf, FileSystem.get(new Configuration()), @@ -281,7 +281,7 @@ public static class Config implements Serializable { description = " Server Host (Set it for externally managed timeline service") public String serverHost = null; - @Parameter(names = {"--view-storage", "-st"}, description = "View Storage Type. Defaut - SPILLABLE_DISK") + @Parameter(names = {"--view-storage", "-st"}, description = "View Storage Type. 
Default - SPILLABLE_DISK") public FileSystemViewStorageType viewStorageType = FileSystemViewStorageType.SPILLABLE_DISK; @Parameter(names = {"--max-view-mem-per-table", "-mv"}, @@ -310,7 +310,7 @@ public static class Config implements Serializable { @Parameter(names = {"--help", "-h"}) public Boolean help = false; - public TimelineService.Config getTimelinServerConfig() { + public TimelineService.Config getTimelineServerConfig() { TimelineService.Config c = new TimelineService.Config(); c.viewStorageType = viewStorageType; c.baseStorePathForFileGroups = baseStorePathForFileGroups; diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/HiveSchemaProvider.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/HiveSchemaProvider.java index 219b1ae57886d..9fca2a241a66b 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/HiveSchemaProvider.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/HiveSchemaProvider.java @@ -19,12 +19,12 @@ package org.apache.hudi.utilities.schema; -import org.apache.avro.Schema; import org.apache.hudi.AvroConversionUtils; import org.apache.hudi.DataSourceUtils; import org.apache.hudi.common.config.TypedProperties; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; +import org.apache.hudi.utilities.exception.HoodieSchemaProviderException; + +import org.apache.avro.Schema; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.catalyst.TableIdentifier; @@ -34,6 +34,9 @@ import java.util.Collections; +/** + * A schema provider to get data schema through user specified hive table. + */ public class HiveSchemaProvider extends SchemaProvider { /** @@ -46,40 +49,42 @@ public static class Config { private static final String TARGET_SCHEMA_TABLE_PROP = "hoodie.deltastreamer.schemaprovider.target.schema.hive.table"; } - private static final Logger LOG = LogManager.getLogger(HiveSchemaProvider.class); - private final Schema sourceSchema; - private Schema targetSchema; public HiveSchemaProvider(TypedProperties props, JavaSparkContext jssc) { super(props, jssc); DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(Config.SOURCE_SCHEMA_TABLE_PROP)); - String sourceSchemaDBName = props.getString(Config.SOURCE_SCHEMA_DATABASE_PROP, "default"); + String sourceSchemaDatabaseName = props.getString(Config.SOURCE_SCHEMA_DATABASE_PROP, "default"); String sourceSchemaTableName = props.getString(Config.SOURCE_SCHEMA_TABLE_PROP); SparkSession spark = SparkSession.builder().config(jssc.getConf()).enableHiveSupport().getOrCreate(); + + // source schema try { - TableIdentifier sourceSchemaTable = new TableIdentifier(sourceSchemaTableName, scala.Option.apply(sourceSchemaDBName)); + TableIdentifier sourceSchemaTable = new TableIdentifier(sourceSchemaTableName, scala.Option.apply(sourceSchemaDatabaseName)); StructType sourceSchema = spark.sessionState().catalog().getTableMetadata(sourceSchemaTable).schema(); - this.sourceSchema = AvroConversionUtils.convertStructTypeToAvroSchema( - sourceSchema, - sourceSchemaTableName, - "hoodie." + sourceSchemaDBName); + sourceSchema, + sourceSchemaTableName, + "hoodie." 
+ sourceSchemaDatabaseName); + } catch (NoSuchTableException | NoSuchDatabaseException e) { + throw new HoodieSchemaProviderException(String.format("Can't find Hive table: %s.%s", sourceSchemaDatabaseName, sourceSchemaTableName), e); + } - if (props.containsKey(Config.TARGET_SCHEMA_TABLE_PROP)) { - String targetSchemaDBName = props.getString(Config.TARGET_SCHEMA_DATABASE_PROP, "default"); - String targetSchemaTableName = props.getString(Config.TARGET_SCHEMA_TABLE_PROP); - TableIdentifier targetSchemaTable = new TableIdentifier(targetSchemaTableName, scala.Option.apply(targetSchemaDBName)); + // target schema + if (props.containsKey(Config.TARGET_SCHEMA_TABLE_PROP)) { + String targetSchemaDatabaseName = props.getString(Config.TARGET_SCHEMA_DATABASE_PROP, "default"); + String targetSchemaTableName = props.getString(Config.TARGET_SCHEMA_TABLE_PROP); + try { + TableIdentifier targetSchemaTable = new TableIdentifier(targetSchemaTableName, scala.Option.apply(targetSchemaDatabaseName)); StructType targetSchema = spark.sessionState().catalog().getTableMetadata(targetSchemaTable).schema(); this.targetSchema = AvroConversionUtils.convertStructTypeToAvroSchema( - targetSchema, - targetSchemaTableName, - "hoodie." + targetSchemaDBName); + targetSchema, + targetSchemaTableName, + "hoodie." + targetSchemaDatabaseName); + } catch (NoSuchDatabaseException | NoSuchTableException e) { + throw new HoodieSchemaProviderException(String.format("Can't find Hive table: %s.%s", targetSchemaDatabaseName, targetSchemaTableName), e); } - } catch (NoSuchTableException | NoSuchDatabaseException e) { - String message = String.format("Can't find Hive table(s): %s", sourceSchemaTableName + "," + props.getString(Config.TARGET_SCHEMA_TABLE_PROP)); - throw new IllegalArgumentException(message, e); } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/SchemaRegistryProvider.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/SchemaRegistryProvider.java index 216369296ad53..1046eac975968 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/SchemaRegistryProvider.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/SchemaRegistryProvider.java @@ -50,7 +50,7 @@ public class SchemaRegistryProvider extends SchemaProvider { public static class Config { public static final String SRC_SCHEMA_REGISTRY_URL_PROP = "hoodie.deltastreamer.schemaprovider.registry.url"; - private static final String TARGET_SCHEMA_REGISTRY_URL_PROP = + public static final String TARGET_SCHEMA_REGISTRY_URL_PROP = "hoodie.deltastreamer.schemaprovider.registry.targetUrl"; } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/AvroKafkaSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/AvroKafkaSource.java index ff8ea5a7aa2da..84c6fd815e838 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/AvroKafkaSource.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/AvroKafkaSource.java @@ -25,6 +25,7 @@ import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics; import org.apache.hudi.utilities.deser.KafkaAvroSchemaDeserializer; +import org.apache.hudi.utilities.exception.HoodieSourceTimeoutException; import org.apache.hudi.utilities.schema.SchemaProvider; import org.apache.hudi.utilities.sources.helpers.AvroConvertor; import org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen; @@ -64,12 +65,12 @@ public 
AvroKafkaSource(TypedProperties props, JavaSparkContext sparkContext, Spa SchemaProvider schemaProvider, HoodieDeltaStreamerMetrics metrics) { super(props, sparkContext, sparkSession, schemaProvider); - props.put(NATIVE_KAFKA_KEY_DESERIALIZER_PROP, StringDeserializer.class); + props.put(NATIVE_KAFKA_KEY_DESERIALIZER_PROP, StringDeserializer.class.getName()); deserializerClassName = props.getString(DataSourceWriteOptions.KAFKA_AVRO_VALUE_DESERIALIZER_CLASS().key(), DataSourceWriteOptions.KAFKA_AVRO_VALUE_DESERIALIZER_CLASS().defaultValue()); try { - props.put(NATIVE_KAFKA_VALUE_DESERIALIZER_PROP, Class.forName(deserializerClassName)); + props.put(NATIVE_KAFKA_VALUE_DESERIALIZER_PROP, Class.forName(deserializerClassName).getName()); if (deserializerClassName.equals(KafkaAvroSchemaDeserializer.class.getName())) { if (schemaProvider == null) { throw new HoodieIOException("SchemaProvider has to be set to use KafkaAvroSchemaDeserializer"); @@ -89,14 +90,18 @@ public AvroKafkaSource(TypedProperties props, JavaSparkContext sparkContext, Spa @Override protected InputBatch> fetchNewData(Option lastCheckpointStr, long sourceLimit) { - OffsetRange[] offsetRanges = offsetGen.getNextOffsetRanges(lastCheckpointStr, sourceLimit, metrics); - long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges); - LOG.info("About to read " + totalNewMsgs + " from Kafka for topic :" + offsetGen.getTopicName()); - if (totalNewMsgs <= 0) { - return new InputBatch<>(Option.empty(), CheckpointUtils.offsetsToStr(offsetRanges)); + try { + OffsetRange[] offsetRanges = offsetGen.getNextOffsetRanges(lastCheckpointStr, sourceLimit, metrics); + long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges); + LOG.info("About to read " + totalNewMsgs + " from Kafka for topic :" + offsetGen.getTopicName()); + if (totalNewMsgs <= 0) { + return new InputBatch<>(Option.empty(), CheckpointUtils.offsetsToStr(offsetRanges)); + } + JavaRDD newDataRDD = toRDD(offsetRanges); + return new InputBatch<>(Option.of(newDataRDD), CheckpointUtils.offsetsToStr(offsetRanges)); + } catch (org.apache.kafka.common.errors.TimeoutException e) { + throw new HoodieSourceTimeoutException("Kafka Source timed out " + e.getMessage()); } - JavaRDD newDataRDD = toRDD(offsetRanges); - return new InputBatch<>(Option.of(newDataRDD), CheckpointUtils.offsetsToStr(offsetRanges)); } private JavaRDD toRDD(OffsetRange[] offsetRanges) { diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/HoodieIncrSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/HoodieIncrSource.java index ebb359390be0c..aa1e261c250b5 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/HoodieIncrSource.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/HoodieIncrSource.java @@ -31,7 +31,6 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.DataFrameReader; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; @@ -123,21 +122,34 @@ public Pair>, String> fetchNextBatch(Option lastCkpt Option beginInstant = lastCkptStr.isPresent() ? lastCkptStr.get().isEmpty() ? 
Option.empty() : lastCkptStr : Option.empty(); - Pair instantEndpts = IncrSourceHelper.calculateBeginAndEndInstants(sparkContext, srcPath, + Pair> queryTypeAndInstantEndpts = IncrSourceHelper.calculateBeginAndEndInstants(sparkContext, srcPath, numInstantsPerFetch, beginInstant, missingCheckpointStrategy); - if (instantEndpts.getKey().equals(instantEndpts.getValue())) { - LOG.warn("Already caught up. Begin Checkpoint was :" + instantEndpts.getKey()); - return Pair.of(Option.empty(), instantEndpts.getKey()); + if (queryTypeAndInstantEndpts.getValue().getKey().equals(queryTypeAndInstantEndpts.getValue().getValue())) { + LOG.warn("Already caught up. Begin Checkpoint was :" + queryTypeAndInstantEndpts.getValue().getKey()); + return Pair.of(Option.empty(), queryTypeAndInstantEndpts.getValue().getKey()); } + Dataset source = null; // Do Incr pull. Set end instant if available - DataFrameReader reader = sparkSession.read().format("org.apache.hudi") - .option(DataSourceReadOptions.QUERY_TYPE().key(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL()) - .option(DataSourceReadOptions.BEGIN_INSTANTTIME().key(), instantEndpts.getLeft()) - .option(DataSourceReadOptions.END_INSTANTTIME().key(), instantEndpts.getRight()); - - Dataset source = reader.load(srcPath); + if (queryTypeAndInstantEndpts.getKey().equals(DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())) { + source = sparkSession.read().format("org.apache.hudi") + .option(DataSourceReadOptions.QUERY_TYPE().key(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL()) + .option(DataSourceReadOptions.BEGIN_INSTANTTIME().key(), queryTypeAndInstantEndpts.getValue().getLeft()) + .option(DataSourceReadOptions.END_INSTANTTIME().key(), queryTypeAndInstantEndpts.getValue().getRight()) + .option(DataSourceReadOptions.INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN_FOR_NON_EXISTING_FILES().key(), + props.getString(DataSourceReadOptions.INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN_FOR_NON_EXISTING_FILES().key(), + DataSourceReadOptions.INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN_FOR_NON_EXISTING_FILES().defaultValue())) + .load(srcPath); + } else { + // if checkpoint is missing from source table, and if strategy is set to READ_UPTO_LATEST_COMMIT, we have to issue snapshot query + source = sparkSession.read().format("org.apache.hudi") + .option(DataSourceReadOptions.QUERY_TYPE().key(), DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL()) + .load(srcPath) + // add filtering so that only interested records are returned. + .filter(String.format("%s > '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, + queryTypeAndInstantEndpts.getRight().getLeft())); + } /* * log.info("Partition Fields are : (" + partitionFields + "). 
Initial Source Schema :" + source.schema()); @@ -165,6 +177,6 @@ public Pair>, String> fetchNextBatch(Option lastCkpt final Dataset src = source.drop(HoodieRecord.HOODIE_META_COLUMNS.stream() .filter(x -> !x.equals(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).toArray(String[]::new)); // log.info("Final Schema from Source is :" + src.schema()); - return Pair.of(Option.of(src), instantEndpts.getRight()); + return Pair.of(Option.of(src), queryTypeAndInstantEndpts.getRight().getRight()); } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/JsonKafkaSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/JsonKafkaSource.java index 3dfc611000a35..d6152a177f7fd 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/JsonKafkaSource.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/JsonKafkaSource.java @@ -20,7 +20,9 @@ import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics; +import org.apache.hudi.utilities.exception.HoodieSourceTimeoutException; import org.apache.hudi.utilities.schema.SchemaProvider; import org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen; import org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen.CheckpointUtils; @@ -35,8 +37,6 @@ import org.apache.spark.streaming.kafka010.LocationStrategies; import org.apache.spark.streaming.kafka010.OffsetRange; -import java.util.Objects; - /** * Read json kafka data. */ @@ -52,21 +52,25 @@ public JsonKafkaSource(TypedProperties properties, JavaSparkContext sparkContext SchemaProvider schemaProvider, HoodieDeltaStreamerMetrics metrics) { super(properties, sparkContext, sparkSession, schemaProvider); this.metrics = metrics; - properties.put("key.deserializer", StringDeserializer.class); - properties.put("value.deserializer", StringDeserializer.class); + properties.put("key.deserializer", StringDeserializer.class.getName()); + properties.put("value.deserializer", StringDeserializer.class.getName()); offsetGen = new KafkaOffsetGen(properties); } @Override protected InputBatch> fetchNewData(Option lastCheckpointStr, long sourceLimit) { - OffsetRange[] offsetRanges = offsetGen.getNextOffsetRanges(lastCheckpointStr, sourceLimit, metrics); - long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges); - LOG.info("About to read " + totalNewMsgs + " from Kafka for topic :" + offsetGen.getTopicName()); - if (totalNewMsgs <= 0) { - return new InputBatch<>(Option.empty(), CheckpointUtils.offsetsToStr(offsetRanges)); + try { + OffsetRange[] offsetRanges = offsetGen.getNextOffsetRanges(lastCheckpointStr, sourceLimit, metrics); + long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges); + LOG.info("About to read " + totalNewMsgs + " from Kafka for topic :" + offsetGen.getTopicName()); + if (totalNewMsgs <= 0) { + return new InputBatch<>(Option.empty(), CheckpointUtils.offsetsToStr(offsetRanges)); + } + JavaRDD newDataRDD = toRDD(offsetRanges); + return new InputBatch<>(Option.of(newDataRDD), CheckpointUtils.offsetsToStr(offsetRanges)); + } catch (org.apache.kafka.common.errors.TimeoutException e) { + throw new HoodieSourceTimeoutException("Kafka Source timed out " + e.getMessage()); } - JavaRDD newDataRDD = toRDD(offsetRanges); - return new InputBatch<>(Option.of(newDataRDD), CheckpointUtils.offsetsToStr(offsetRanges)); } private JavaRDD toRDD(OffsetRange[] offsetRanges) { @@ 
-74,7 +78,7 @@ private JavaRDD toRDD(OffsetRange[] offsetRanges) { offsetGen.getKafkaParams(), offsetRanges, LocationStrategies.PreferConsistent()) - .filter(x -> Objects.nonNull(x.value())) + .filter(x -> !StringUtils.isNullOrEmpty((String)x.value())) .map(x -> x.value().toString()); } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/S3EventsHoodieIncrSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/S3EventsHoodieIncrSource.java index 434e14a80453d..2f7d9898b95b0 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/S3EventsHoodieIncrSource.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/S3EventsHoodieIncrSource.java @@ -22,6 +22,7 @@ import org.apache.hudi.DataSourceUtils; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.collection.Pair; @@ -33,7 +34,6 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.DataFrameReader; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; @@ -101,34 +101,43 @@ public Pair>, String> fetchNextBatch(Option lastCkpt ? lastCkptStr.get().isEmpty() ? Option.empty() : lastCkptStr : Option.empty(); - Pair instantEndpts = + Pair> queryTypeAndInstantEndpts = IncrSourceHelper.calculateBeginAndEndInstants( sparkContext, srcPath, numInstantsPerFetch, beginInstant, missingCheckpointStrategy); - if (instantEndpts.getKey().equals(instantEndpts.getValue())) { - LOG.warn("Already caught up. Begin Checkpoint was :" + instantEndpts.getKey()); - return Pair.of(Option.empty(), instantEndpts.getKey()); + if (queryTypeAndInstantEndpts.getValue().getKey().equals(queryTypeAndInstantEndpts.getValue().getValue())) { + LOG.warn("Already caught up. Begin Checkpoint was :" + queryTypeAndInstantEndpts.getValue().getKey()); + return Pair.of(Option.empty(), queryTypeAndInstantEndpts.getValue().getKey()); } + Dataset source = null; // Do incremental pull. Set end instant if available. 
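+      // (Editor's sketch, kept as comments since this sits mid-method:) with a hypothetical
+      // begin instant of 20220101000000, the snapshot branch below is equivalent to reading
+      // the whole source table and applying the predicate
+      //   _hoodie_commit_time > '20220101000000'
+      // so only records committed after the computed checkpoint are ingested.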
- DataFrameReader metaReader = sparkSession.read().format("org.apache.hudi") - .option(DataSourceReadOptions.QUERY_TYPE().key(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL()) - .option(DataSourceReadOptions.BEGIN_INSTANTTIME().key(), instantEndpts.getLeft()) - .option(DataSourceReadOptions.END_INSTANTTIME().key(), instantEndpts.getRight()); - Dataset source = metaReader.load(srcPath); + if (queryTypeAndInstantEndpts.getKey().equals(DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())) { + source = sparkSession.read().format("org.apache.hudi") + .option(DataSourceReadOptions.QUERY_TYPE().key(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL()) + .option(DataSourceReadOptions.BEGIN_INSTANTTIME().key(), queryTypeAndInstantEndpts.getRight().getLeft()) + .option(DataSourceReadOptions.END_INSTANTTIME().key(), queryTypeAndInstantEndpts.getRight().getRight()).load(srcPath); + } else { + // if checkpoint is missing from source table, and if strategy is set to READ_UPTO_LATEST_COMMIT, we have to issue snapshot query + source = sparkSession.read().format("org.apache.hudi") + .option(DataSourceReadOptions.QUERY_TYPE().key(), DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL()).load(srcPath) + // add filtering so that only interested records are returned. + .filter(String.format("%s > '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, + queryTypeAndInstantEndpts.getRight().getLeft())); + } if (source.isEmpty()) { - return Pair.of(Option.empty(), instantEndpts.getRight()); + return Pair.of(Option.empty(), queryTypeAndInstantEndpts.getRight().getRight()); } String filter = "s3.object.size > 0"; - if (!StringUtils.isNullOrEmpty(props.getString(Config.S3_KEY_PREFIX))) { + if (!StringUtils.isNullOrEmpty(props.getString(Config.S3_KEY_PREFIX, null))) { filter = filter + " and s3.object.key like '" + props.getString(Config.S3_KEY_PREFIX) + "%'"; } - if (!StringUtils.isNullOrEmpty(props.getString(Config.S3_IGNORE_KEY_PREFIX))) { + if (!StringUtils.isNullOrEmpty(props.getString(Config.S3_IGNORE_KEY_PREFIX, null))) { filter = filter + " and s3.object.key not like '" + props.getString(Config.S3_IGNORE_KEY_PREFIX) + "%'"; } - if (!StringUtils.isNullOrEmpty(props.getString(Config.S3_IGNORE_KEY_SUBSTRING))) { + if (!StringUtils.isNullOrEmpty(props.getString(Config.S3_IGNORE_KEY_SUBSTRING, null))) { filter = filter + " and s3.object.key not like '%" + props.getString(Config.S3_IGNORE_KEY_SUBSTRING) + "%'"; } // add file format filtering by default @@ -167,6 +176,6 @@ public Pair>, String> fetchNextBatch(Option lastCkpt if (!cloudFiles.isEmpty()) { dataset = Option.of(sparkSession.read().format(fileFormat).load(cloudFiles.toArray(new String[0]))); } - return Pair.of(dataset, instantEndpts.getRight()); + return Pair.of(dataset, queryTypeAndInstantEndpts.getRight().getRight()); } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/SqlSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/SqlSource.java index d832e43d2ae0b..056e035175937 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/SqlSource.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/SqlSource.java @@ -48,6 +48,8 @@ *

    To fetch and use the latest incremental checkpoint, you also need to set this hoodie_conf for deltastreamer jobs: * *

    hoodie.write.meta.key.prefixes = 'deltastreamer.checkpoint.key' + * + * Also, users are expected to set --allow-commit-on-no-checkpoint-change while using this SqlSource. */ public class SqlSource extends RowSource { private static final long serialVersionUID = 1L; diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/debezium/DebeziumSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/debezium/DebeziumSource.java index 7018419c2d6de..d9be692b5bc57 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/debezium/DebeziumSource.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/debezium/DebeziumSource.java @@ -82,12 +82,12 @@ public DebeziumSource(TypedProperties props, JavaSparkContext sparkContext, HoodieDeltaStreamerMetrics metrics) { super(props, sparkContext, sparkSession, schemaProvider); - props.put(NATIVE_KAFKA_KEY_DESERIALIZER_PROP, StringDeserializer.class); + props.put(NATIVE_KAFKA_KEY_DESERIALIZER_PROP, StringDeserializer.class.getName()); deserializerClassName = props.getString(DataSourceWriteOptions.KAFKA_AVRO_VALUE_DESERIALIZER_CLASS().key(), DataSourceWriteOptions.KAFKA_AVRO_VALUE_DESERIALIZER_CLASS().defaultValue()); try { - props.put(NATIVE_KAFKA_VALUE_DESERIALIZER_PROP, Class.forName(deserializerClassName)); + props.put(NATIVE_KAFKA_VALUE_DESERIALIZER_PROP, Class.forName(deserializerClassName).getName()); } catch (ClassNotFoundException e) { String error = "Could not load custom avro kafka deserializer: " + deserializerClassName; LOG.error(error); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/IncrSourceHelper.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/IncrSourceHelper.java index a370c314a168f..cbfb153ee9ca4 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/IncrSourceHelper.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/IncrSourceHelper.java @@ -18,6 +18,7 @@ package org.apache.hudi.utilities.sources.helpers; +import org.apache.hudi.DataSourceReadOptions; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; @@ -63,10 +64,10 @@ private static String getStrictlyLowerTimestamp(String timestamp) { * @param numInstantsPerFetch Max Instants per fetch * @param beginInstant Last Checkpoint String * @param missingCheckpointStrategy when begin instant is missing, allow reading based on missing checkpoint strategy - * @return begin and end instants + * @return begin and end instants along with query type. 
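(Editor's note: a minimal sketch of how callers unpack the new nested return shape; the names follow this patch, and READ_LATEST is the strategy enum value referenced in the code below:)

    Pair<String, Pair<String, String>> queryTypeAndInstantEndpts =
        IncrSourceHelper.calculateBeginAndEndInstants(jssc, srcBasePath,
            numInstantsPerFetch, beginInstant, MissingCheckpointStrategy.READ_LATEST);
    String queryType = queryTypeAndInstantEndpts.getKey();                    // incremental or snapshot
    String beginInstantTime = queryTypeAndInstantEndpts.getValue().getLeft(); // checkpoint to start from
    String endInstantTime = queryTypeAndInstantEndpts.getValue().getRight();  // instant to stop at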
*/ - public static Pair calculateBeginAndEndInstants(JavaSparkContext jssc, String srcBasePath, - int numInstantsPerFetch, Option beginInstant, MissingCheckpointStrategy missingCheckpointStrategy) { + public static Pair> calculateBeginAndEndInstants(JavaSparkContext jssc, String srcBasePath, + int numInstantsPerFetch, Option beginInstant, MissingCheckpointStrategy missingCheckpointStrategy) { ValidationUtils.checkArgument(numInstantsPerFetch > 0, "Make sure the config hoodie.deltastreamer.source.hoodieincr.num_instants is set to a positive value"); HoodieTableMetaClient srcMetaClient = HoodieTableMetaClient.builder().setConf(jssc.hadoopConfiguration()).setBasePath(srcBasePath).setLoadActiveTimelineOnLoad(true).build(); @@ -88,15 +89,14 @@ public static Pair calculateBeginAndEndInstants(JavaSparkContext } }); - if (!beginInstantTime.equals(DEFAULT_BEGIN_TIMESTAMP)) { + if (missingCheckpointStrategy == MissingCheckpointStrategy.READ_LATEST || !activeCommitTimeline.isBeforeTimelineStarts(beginInstantTime)) { Option nthInstant = Option.fromJavaOptional(activeCommitTimeline .findInstantsAfter(beginInstantTime, numInstantsPerFetch).getInstants().reduce((x, y) -> y)); - return Pair.of(beginInstantTime, nthInstant.map(HoodieInstant::getTimestamp).orElse(beginInstantTime)); + return Pair.of(DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL(), Pair.of(beginInstantTime, nthInstant.map(HoodieInstant::getTimestamp).orElse(beginInstantTime))); } else { - // if beginInstant is DEFAULT_BEGIN_TIMESTAMP, MissingCheckpointStrategy should be set. - // when MissingCheckpointStrategy is set to read everything until latest. + // when MissingCheckpointStrategy is set to read everything until latest, trigger snapshot query. Option lastInstant = activeCommitTimeline.lastInstant(); - return Pair.of(beginInstantTime, lastInstant.get().getTimestamp()); + return Pair.of(DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL(), Pair.of(beginInstantTime, lastInstant.get().getTimestamp())); } } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/util/BloomFilterData.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/util/BloomFilterData.java new file mode 100644 index 0000000000000..1d4f0539136b4 --- /dev/null +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/util/BloomFilterData.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.utilities.util; + +import org.jetbrains.annotations.NotNull; + +import java.nio.ByteBuffer; +import java.util.Objects; + +/** + * Includes partition path, filename and bloom filter for validation + */ +public class BloomFilterData implements Comparable { + private final String partitionPath; + private final String filename; + private final ByteBuffer bloomFilter; + + private BloomFilterData( + String partitionPath, String filename, ByteBuffer bloomFilter) { + this.partitionPath = partitionPath; + this.filename = filename; + this.bloomFilter = bloomFilter; + } + + public static Builder builder() { + return new Builder(); + } + + @Override + public int compareTo(@NotNull BloomFilterData o) { + return this.toString().compareTo(o.toString()); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + BloomFilterData that = (BloomFilterData) o; + return partitionPath.equals(that.partitionPath) && filename.equals(that.filename) + && bloomFilter.equals(that.bloomFilter); + } + + @Override + public int hashCode() { + return Objects.hash(partitionPath, filename, bloomFilter); + } + + @Override + public String toString() { + String bloomFilterString = new String(bloomFilter.array()); + return "BloomFilterData{" + + "partitionPath='" + partitionPath + '\'' + + ", filename='" + filename + '\'' + + ", bloomFilter=" + + (bloomFilterString.length() > 50 ? bloomFilterString.substring(0, 50) + "..." : bloomFilterString) + + '}'; + } + + public static class Builder { + private String partitionPath; + private String filename; + private ByteBuffer bloomFilter; + + public Builder setPartitionPath(String partitionPath) { + this.partitionPath = partitionPath; + return this; + } + + public Builder setFilename(String filename) { + this.filename = filename; + return this; + } + + public Builder setBloomFilter(ByteBuffer bloomFilter) { + this.bloomFilter = bloomFilter; + return this; + } + + public BloomFilterData build() { + return new BloomFilterData(partitionPath, filename, bloomFilter); + } + } +} diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestSchedulerConfGenerator.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestSchedulerConfGenerator.java index 77fd04fb4bdd3..26b8bc1c88580 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestSchedulerConfGenerator.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/deltastreamer/TestSchedulerConfGenerator.java @@ -18,6 +18,7 @@ package org.apache.hudi.utilities.deltastreamer; +import org.apache.hudi.SparkConfigs; import org.apache.hudi.common.model.HoodieTableType; import org.junit.jupiter.api.Test; @@ -33,21 +34,21 @@ public class TestSchedulerConfGenerator { public void testGenerateSparkSchedulingConf() throws Exception { HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config(); Map configs = SchedulerConfGenerator.getSparkSchedulingConfigs(cfg); - assertNull(configs.get(SchedulerConfGenerator.SPARK_SCHEDULER_ALLOCATION_FILE_KEY), "spark.scheduler.mode not set"); + assertNull(configs.get(SparkConfigs.SPARK_SCHEDULER_ALLOCATION_FILE_KEY()), "spark.scheduler.mode not set"); System.setProperty(SchedulerConfGenerator.SPARK_SCHEDULER_MODE_KEY, "FAIR"); cfg.continuousMode = false; configs = SchedulerConfGenerator.getSparkSchedulingConfigs(cfg); - 
assertNull(configs.get(SchedulerConfGenerator.SPARK_SCHEDULER_ALLOCATION_FILE_KEY), "continuousMode is false"); + assertNull(configs.get(SparkConfigs.SPARK_SCHEDULER_ALLOCATION_FILE_KEY()), "continuousMode is false"); cfg.continuousMode = true; cfg.tableType = HoodieTableType.COPY_ON_WRITE.name(); configs = SchedulerConfGenerator.getSparkSchedulingConfigs(cfg); - assertNull(configs.get(SchedulerConfGenerator.SPARK_SCHEDULER_ALLOCATION_FILE_KEY), + assertNull(configs.get(SparkConfigs.SPARK_SCHEDULER_ALLOCATION_FILE_KEY()), "table type is not MERGE_ON_READ"); cfg.tableType = HoodieTableType.MERGE_ON_READ.name(); configs = SchedulerConfGenerator.getSparkSchedulingConfigs(cfg); - assertNotNull(configs.get(SchedulerConfGenerator.SPARK_SCHEDULER_ALLOCATION_FILE_KEY), "all satisfies"); + assertNotNull(configs.get(SparkConfigs.SPARK_SCHEDULER_ALLOCATION_FILE_KEY()), "all satisfies"); } } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/HoodieDeltaStreamerTestBase.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/HoodieDeltaStreamerTestBase.java index b7e6f1870df44..02b1848e2e31e 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/HoodieDeltaStreamerTestBase.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/HoodieDeltaStreamerTestBase.java @@ -61,6 +61,7 @@ public class HoodieDeltaStreamerTestBase extends UtilitiesTestBase { static final String PROPS_FILENAME_TEST_PARQUET = "test-parquet-dfs-source.properties"; static final String PROPS_FILENAME_TEST_ORC = "test-orc-dfs-source.properties"; static final String PROPS_FILENAME_TEST_JSON_KAFKA = "test-json-kafka-dfs-source.properties"; + static final String PROPS_FILENAME_TEST_SQL_SOURCE = "test-sql-source-source.properties"; static final String PROPS_FILENAME_TEST_MULTI_WRITER = "test-multi-writer.properties"; static final String FIRST_PARQUET_FILE_NAME = "1.parquet"; static final String FIRST_ORC_FILE_NAME = "1.orc"; @@ -71,6 +72,7 @@ public class HoodieDeltaStreamerTestBase extends UtilitiesTestBase { static final int ORC_NUM_RECORDS = 5; static final int CSV_NUM_RECORDS = 3; static final int JSON_KAFKA_NUM_RECORDS = 5; + static final int SQL_SOURCE_NUM_RECORDS = 1000; String kafkaCheckpointType = "string"; // Required fields static final String TGT_BASE_PATH_PARAM = "--target-base-path"; @@ -171,7 +173,7 @@ protected static void writeCommonPropsToFile(FileSystem dfs, String dfsBasePath) props.setProperty("include", "sql-transformer.properties"); props.setProperty("hoodie.datasource.write.keygenerator.class", TestHoodieDeltaStreamer.TestGenerator.class.getName()); props.setProperty("hoodie.datasource.write.recordkey.field", "_row_key"); - props.setProperty("hoodie.datasource.write.partitionpath.field", "not_there"); + props.setProperty("hoodie.datasource.write.partitionpath.field", "partition_path"); props.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source.avsc"); props.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target.avsc"); @@ -279,35 +281,35 @@ protected static void prepareORCDFSFiles(int numRecords, String baseORCPath, Str HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(); if (useCustomSchema) { Helpers.saveORCToDFS(Helpers.toGenericRecords( - dataGenerator.generateInsertsAsPerSchema("000", numRecords, schemaStr), - schema), new Path(path), HoodieTestDataGenerator.ORC_TRIP_SCHEMA); + dataGenerator.generateInsertsAsPerSchema("000", 
numRecords, schemaStr), + schema), new Path(path), HoodieTestDataGenerator.ORC_TRIP_SCHEMA); } else { Helpers.saveORCToDFS(Helpers.toGenericRecords( - dataGenerator.generateInserts("000", numRecords)), new Path(path)); + dataGenerator.generateInserts("000", numRecords)), new Path(path)); } } - static void addCommitToTimeline(HoodieTableMetaClient metaCient) throws IOException { - addCommitToTimeline(metaCient, Collections.emptyMap()); + static void addCommitToTimeline(HoodieTableMetaClient metaClient) throws IOException { + addCommitToTimeline(metaClient, Collections.emptyMap()); } - static void addCommitToTimeline(HoodieTableMetaClient metaCient, Map extraMetadata) throws IOException { - addCommitToTimeline(metaCient, WriteOperationType.UPSERT, HoodieTimeline.COMMIT_ACTION, extraMetadata); + static void addCommitToTimeline(HoodieTableMetaClient metaClient, Map extraMetadata) throws IOException { + addCommitToTimeline(metaClient, WriteOperationType.UPSERT, HoodieTimeline.COMMIT_ACTION, extraMetadata); } - static void addReplaceCommitToTimeline(HoodieTableMetaClient metaCient, Map extraMetadata) throws IOException { - addCommitToTimeline(metaCient, WriteOperationType.CLUSTER, HoodieTimeline.REPLACE_COMMIT_ACTION, extraMetadata); + static void addReplaceCommitToTimeline(HoodieTableMetaClient metaClient, Map extraMetadata) throws IOException { + addCommitToTimeline(metaClient, WriteOperationType.CLUSTER, HoodieTimeline.REPLACE_COMMIT_ACTION, extraMetadata); } - static void addCommitToTimeline(HoodieTableMetaClient metaCient, WriteOperationType writeOperationType, String commitActiontype, + static void addCommitToTimeline(HoodieTableMetaClient metaClient, WriteOperationType writeOperationType, String commitActiontype, Map extraMetadata) throws IOException { HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); commitMetadata.setOperationType(writeOperationType); - extraMetadata.forEach((k,v) -> commitMetadata.getExtraMetadata().put(k, v)); + extraMetadata.forEach((k, v) -> commitMetadata.getExtraMetadata().put(k, v)); String commitTime = HoodieActiveTimeline.createNewInstantTime(); - metaCient.getActiveTimeline().createNewInstant(new HoodieInstant(HoodieInstant.State.REQUESTED, commitActiontype, commitTime)); - metaCient.getActiveTimeline().createNewInstant(new HoodieInstant(HoodieInstant.State.INFLIGHT, commitActiontype, commitTime)); - metaCient.getActiveTimeline().saveAsComplete( + metaClient.getActiveTimeline().createNewInstant(new HoodieInstant(HoodieInstant.State.REQUESTED, commitActiontype, commitTime)); + metaClient.getActiveTimeline().createNewInstant(new HoodieInstant(HoodieInstant.State.INFLIGHT, commitActiontype, commitTime)); + metaClient.getActiveTimeline().saveAsComplete( new HoodieInstant(HoodieInstant.State.INFLIGHT, commitActiontype, commitTime), Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8))); } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHDFSParquetImporter.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHDFSParquetImporter.java index 3ac490bf9163e..a57be62461d45 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHDFSParquetImporter.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHDFSParquetImporter.java @@ -233,8 +233,7 @@ public List createInsertRecords(Path srcFolder) throws ParseExcep long startTime = HoodieActiveTimeline.parseDateFromInstantTime("20170203000000").getTime() / 1000; List records = new 
ArrayList(); for (long recordNum = 0; recordNum < 96; recordNum++) { - records.add(HoodieTestDataGenerator.generateGenericRecord(Long.toString(recordNum), "0", "rider-" + recordNum, - "driver-" + recordNum, startTime + TimeUnit.HOURS.toSeconds(recordNum))); + records.add(new HoodieTestDataGenerator().generateGenericRecord(Long.toString(recordNum), "0", "rider-" + recordNum, "driver-" + recordNum, startTime + TimeUnit.HOURS.toSeconds(recordNum))); } try (ParquetWriter writer = AvroParquetWriter.builder(srcFile) .withSchema(HoodieTestDataGenerator.AVRO_SCHEMA).withConf(HoodieTestUtils.getDefaultHadoopConf()).build()) { @@ -251,12 +250,12 @@ public List createUpsertRecords(Path srcFolder) throws ParseExcep List records = new ArrayList(); // 10 for update for (long recordNum = 0; recordNum < 11; recordNum++) { - records.add(HoodieTestDataGenerator.generateGenericRecord(Long.toString(recordNum), "0", "rider-upsert-" + recordNum, + records.add(new HoodieTestDataGenerator().generateGenericRecord(Long.toString(recordNum), "0", "rider-upsert-" + recordNum, "driver-upsert" + recordNum, startTime + TimeUnit.HOURS.toSeconds(recordNum))); } // 4 for insert for (long recordNum = 96; recordNum < 100; recordNum++) { - records.add(HoodieTestDataGenerator.generateGenericRecord(Long.toString(recordNum), "0", "rider-upsert-" + recordNum, + records.add(new HoodieTestDataGenerator().generateGenericRecord(Long.toString(recordNum), "0", "rider-upsert-" + recordNum, "driver-upsert" + recordNum, startTime + TimeUnit.HOURS.toSeconds(recordNum))); } try (ParquetWriter writer = AvroParquetWriter.builder(srcFile) diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieDeltaStreamer.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieDeltaStreamer.java index 1874991888cbf..1c80896586515 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieDeltaStreamer.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieDeltaStreamer.java @@ -19,18 +19,22 @@ package org.apache.hudi.utilities.functional; import org.apache.hudi.AvroConversionUtils; +import org.apache.hudi.DataSourceReadOptions; import org.apache.hudi.DataSourceWriteOptions; import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.transaction.lock.InProcessLockProvider; import org.apache.hudi.common.config.DFSPropertiesConfiguration; import org.apache.hudi.common.config.HoodieConfig; import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieReplaceCommitMetadata; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; +import org.apache.hudi.common.model.WriteConcurrencyMode; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; @@ -43,6 +47,7 @@ import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.config.HoodieClusteringConfig; import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieLockConfig; import org.apache.hudi.config.HoodieWriteConfig; import 
org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.TableNotFoundException; @@ -63,6 +68,7 @@ import org.apache.hudi.utilities.sources.JsonKafkaSource; import org.apache.hudi.utilities.sources.ORCDFSSource; import org.apache.hudi.utilities.sources.ParquetDFSSource; +import org.apache.hudi.utilities.sources.SqlSource; import org.apache.hudi.utilities.sources.TestDataSource; import org.apache.hudi.utilities.testutils.JdbcTestUtils; import org.apache.hudi.utilities.testutils.UtilitiesTestBase; @@ -207,6 +213,14 @@ static HoodieDeltaStreamer.Config makeConfig(String basePath, WriteOperationType List transformerClassNames, String propsFilename, boolean enableHiveSync, boolean useSchemaProviderClass, int sourceLimit, boolean updatePayloadClass, String payloadClassName, String tableType, String sourceOrderingField, String checkpoint) { + return makeConfig(basePath, op, sourceClassName, transformerClassNames, propsFilename, enableHiveSync, useSchemaProviderClass, sourceLimit, updatePayloadClass, payloadClassName, + tableType, sourceOrderingField, checkpoint, false); + } + + static HoodieDeltaStreamer.Config makeConfig(String basePath, WriteOperationType op, String sourceClassName, + List transformerClassNames, String propsFilename, boolean enableHiveSync, boolean useSchemaProviderClass, + int sourceLimit, boolean updatePayloadClass, String payloadClassName, String tableType, String sourceOrderingField, + String checkpoint, boolean allowCommitOnNoCheckpointChange) { HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config(); cfg.targetBasePath = basePath; cfg.targetTableName = "hoodie_trips"; @@ -225,6 +239,7 @@ static HoodieDeltaStreamer.Config makeConfig(String basePath, WriteOperationType if (useSchemaProviderClass) { cfg.schemaProviderClassName = defaultSchemaProviderClassName; } + cfg.allowCommitOnNoCheckpointChange = allowCommitOnNoCheckpointChange; return cfg; } @@ -257,6 +272,19 @@ static void assertRecordCount(long expected, String tablePath, SQLContext sqlCon assertEquals(expected, recordCount); } + static Map getPartitionRecordCount(String basePath, SQLContext sqlContext) { + sqlContext.clearCache(); + List rows = sqlContext.read().format("org.apache.hudi").load(basePath).groupBy(HoodieRecord.PARTITION_PATH_METADATA_FIELD).count().collectAsList(); + Map partitionRecordCount = new HashMap<>(); + rows.stream().forEach(row -> partitionRecordCount.put(row.getString(0), row.getLong(1))); + return partitionRecordCount; + } + + static void assertNoPartitionMatch(String basePath, SQLContext sqlContext, String partitionToValidate) { + sqlContext.clearCache(); + // quote the partition value so the predicate compares against a string literal rather than an arithmetic expression + assertEquals(0, sqlContext.read().format("org.apache.hudi").load(basePath).filter(HoodieRecord.PARTITION_PATH_METADATA_FIELD + " = '" + partitionToValidate + "'").count()); + } + static void assertDistinctRecordCount(long expected, String tablePath, SQLContext sqlContext) { sqlContext.clearCache(); long recordCount = sqlContext.read().format("org.apache.hudi").load(tablePath).select("_hoodie_record_key").distinct().count(); @@ -754,6 +782,38 @@ public void testInlineClustering(String preserveCommitMetadata) throws Exception }); } + @Test + public void testDeltaSyncWithPendingClustering() throws Exception { + String tableBasePath = dfsBasePath + "/inlineClusteringPending"; + // ingest data + int totalRecords = 2000; + HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT); + cfg.continuousMode = false; + cfg.tableType = HoodieTableType.COPY_ON_WRITE.name(); +
HoodieDeltaStreamer ds = new HoodieDeltaStreamer(cfg, jsc); + ds.sync(); + // assert ingest successful + TestHelpers.assertAtLeastNCommits(1, tableBasePath, dfs); + + // schedule a clustering job to build a clustering plan and transition to inflight + HoodieClusteringJob clusteringJob = initialHoodieClusteringJob(tableBasePath, null, false, "schedule"); + clusteringJob.cluster(0); + HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(dfs.getConf()).setBasePath(tableBasePath).build(); + List hoodieClusteringInstants = meta.getActiveTimeline().filterPendingReplaceTimeline().getInstants().collect(Collectors.toList()); + HoodieInstant clusteringRequest = hoodieClusteringInstants.get(0); + meta.getActiveTimeline().transitionReplaceRequestedToInflight(clusteringRequest, Option.empty()); + + // do another ingestion with inline clustering enabled + cfg.configs.addAll(getAsyncServicesConfigs(totalRecords, "false", "true", "2", "", "")); + cfg.retryLastPendingInlineClusteringJob = true; + HoodieDeltaStreamer ds2 = new HoodieDeltaStreamer(cfg, jsc); + ds2.sync(); + String completeClusteringTimeStamp = meta.reloadActiveTimeline().getCompletedReplaceTimeline().lastInstant().get().getTimestamp(); + assertEquals(clusteringRequest.getTimestamp(), completeClusteringTimeStamp); + TestHelpers.assertAtLeastNCommits(2, tableBasePath, dfs); + TestHelpers.assertAtLeastNReplaceCommits(1, tableBasePath, dfs); + } + @ParameterizedTest @ValueSource(booleans = {true, false}) public void testCleanerDeleteReplacedDataWithArchive(Boolean asyncClean) throws Exception { @@ -768,7 +828,7 @@ public void testCleanerDeleteReplacedDataWithArchive(Boolean asyncClean) throws cfg.tableType = HoodieTableType.COPY_ON_WRITE.name(); cfg.configs.addAll(getAsyncServicesConfigs(totalRecords, "false", "true", "2", "", "")); cfg.configs.add(String.format("%s=%s", HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT.key(), "0")); - cfg.configs.add(HoodieMetadataConfig.ENABLE.key() + "=false"); + cfg.configs.add(String.format("%s=%s", HoodieMetadataConfig.COMPACT_NUM_DELTA_COMMITS.key(), "1")); HoodieDeltaStreamer ds = new HoodieDeltaStreamer(cfg, jsc); deltaStreamerTestRunner(ds, cfg, (r) -> { TestHelpers.assertAtLeastNReplaceCommits(2, tableBasePath, dfs); @@ -819,8 +879,16 @@ public void testCleanerDeleteReplacedDataWithArchive(Boolean asyncClean) throws configs.add(String.format("%s=%s", HoodieCompactionConfig.CLEANER_COMMITS_RETAINED.key(), "1")); configs.add(String.format("%s=%s", HoodieCompactionConfig.MIN_COMMITS_TO_KEEP.key(), "2")); configs.add(String.format("%s=%s", HoodieCompactionConfig.MAX_COMMITS_TO_KEEP.key(), "3")); - configs.add(String.format("%s=%s", HoodieCompactionConfig.ASYNC_CLEAN, asyncClean)); - configs.add(HoodieMetadataConfig.ENABLE.key() + "=false"); + configs.add(String.format("%s=%s", HoodieCompactionConfig.ASYNC_CLEAN.key(), asyncClean)); + configs.add(String.format("%s=%s", HoodieMetadataConfig.COMPACT_NUM_DELTA_COMMITS.key(), "1")); + if (asyncClean) { + configs.add(String.format("%s=%s", HoodieWriteConfig.WRITE_CONCURRENCY_MODE.key(), + WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL.name())); + configs.add(String.format("%s=%s", HoodieCompactionConfig.FAILED_WRITES_CLEANER_POLICY.key(), + HoodieFailedWritesCleaningPolicy.LAZY.name())); + configs.add(String.format("%s=%s", HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.key(), + InProcessLockProvider.class.getName())); + } cfg.configs = configs; cfg.continuousMode = false; ds = new HoodieDeltaStreamer(cfg, jsc); @@ -1323,6 +1391,13 @@ private void 
prepareParquetDFSSource(boolean useSchemaProvider, boolean hasTrans private void prepareParquetDFSSource(boolean useSchemaProvider, boolean hasTransformer, String sourceSchemaFile, String targetSchemaFile, String propsFileName, String parquetSourceRoot, boolean addCommonProps) throws IOException { + prepareParquetDFSSource(useSchemaProvider, hasTransformer, sourceSchemaFile, targetSchemaFile, propsFileName, parquetSourceRoot, addCommonProps, + "not_there"); + } + + private void prepareParquetDFSSource(boolean useSchemaProvider, boolean hasTransformer, String sourceSchemaFile, String targetSchemaFile, + String propsFileName, String parquetSourceRoot, boolean addCommonProps, + String partitionPath) throws IOException { // Properties used for testing delta-streamer with Parquet source TypedProperties parquetProps = new TypedProperties(); @@ -1333,7 +1408,7 @@ private void prepareParquetDFSSource(boolean useSchemaProvider, boolean hasTrans parquetProps.setProperty("include", "base.properties"); parquetProps.setProperty("hoodie.embed.timeline.server", "false"); parquetProps.setProperty("hoodie.datasource.write.recordkey.field", "_row_key"); - parquetProps.setProperty("hoodie.datasource.write.partitionpath.field", "not_there"); + parquetProps.setProperty("hoodie.datasource.write.partitionpath.field", partitionPath); if (useSchemaProvider) { parquetProps.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/" + sourceSchemaFile); if (hasTransformer) { @@ -1668,6 +1743,41 @@ public void testCsvDFSSourceNoHeaderWithSchemaProviderAndTransformer() throws Ex testCsvDFSSource(false, '\t', true, Collections.singletonList(TripsWithDistanceTransformer.class.getName())); } + private void prepareSqlSource() throws IOException { + String sourceRoot = dfsBasePath + "sqlSourceFiles"; + TypedProperties sqlSourceProps = new TypedProperties(); + sqlSourceProps.setProperty("include", "base.properties"); + sqlSourceProps.setProperty("hoodie.embed.timeline.server", "false"); + sqlSourceProps.setProperty("hoodie.datasource.write.recordkey.field", "_row_key"); + sqlSourceProps.setProperty("hoodie.datasource.write.partitionpath.field", "not_there"); + sqlSourceProps.setProperty("hoodie.deltastreamer.source.sql.sql.query","select * from test_sql_table"); + + UtilitiesTestBase.Helpers.savePropsToDFS(sqlSourceProps, dfs, dfsBasePath + "/" + PROPS_FILENAME_TEST_SQL_SOURCE); + + // Data generation + HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(); + generateSqlSourceTestTable(sourceRoot, "1", "1000", SQL_SOURCE_NUM_RECORDS, dataGenerator); + } + + private void generateSqlSourceTestTable(String dfsRoot, String filename, String instantTime, int n, HoodieTestDataGenerator dataGenerator) throws IOException { + Path path = new Path(dfsRoot, filename); + Helpers.saveParquetToDFS(Helpers.toGenericRecords(dataGenerator.generateInserts(instantTime, n, false)), path); + sparkSession.read().parquet(dfsRoot).createOrReplaceTempView("test_sql_table"); + } + + @Test + public void testSqlSourceSource() throws Exception { + prepareSqlSource(); + String tableBasePath = dfsBasePath + "/test_sql_source_table" + testNum++; + HoodieDeltaStreamer deltaStreamer = + new HoodieDeltaStreamer(TestHelpers.makeConfig( + tableBasePath, WriteOperationType.INSERT, SqlSource.class.getName(), + Collections.emptyList(), PROPS_FILENAME_TEST_SQL_SOURCE, false, + false, 1000, false, null, null, "timestamp", null, true), jsc); + deltaStreamer.sync(); + TestHelpers.assertRecordCount(SQL_SOURCE_NUM_RECORDS, 
tableBasePath + "/*/*.parquet", sqlContext); + } + @Test public void testJdbcSourceIncrementalFetchInContinuousMode() { try (Connection connection = DriverManager.getConnection("jdbc:h2:mem:test_mem", "test", "jdbc")) { @@ -1707,6 +1817,54 @@ public void testJdbcSourceIncrementalFetchInContinuousMode() { } } + @Test + public void testHoodieIncrFallback() throws Exception { + String tableBasePath = dfsBasePath + "/incr_test_table"; + String downstreamTableBasePath = dfsBasePath + "/incr_test_downstream_table"; + + insertInTable(tableBasePath, 1, WriteOperationType.BULK_INSERT); + HoodieDeltaStreamer.Config downstreamCfg = + TestHelpers.makeConfigForHudiIncrSrc(tableBasePath, downstreamTableBasePath, + WriteOperationType.BULK_INSERT, true, null); + new HoodieDeltaStreamer(downstreamCfg, jsc).sync(); + + insertInTable(tableBasePath, 9, WriteOperationType.UPSERT); + //No change as this fails with Path not exist error + assertThrows(org.apache.spark.sql.AnalysisException.class, () -> new HoodieDeltaStreamer(downstreamCfg, jsc).sync()); + TestHelpers.assertRecordCount(1000, downstreamTableBasePath + "/*/*", sqlContext); + + if (downstreamCfg.configs == null) { + downstreamCfg.configs = new ArrayList<>(); + } + + downstreamCfg.configs.add(DataSourceReadOptions.INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN_FOR_NON_EXISTING_FILES().key() + "=true"); + //Adding this conf to make testing easier :) + downstreamCfg.configs.add("hoodie.deltastreamer.source.hoodieincr.num_instants=10"); + downstreamCfg.operation = WriteOperationType.UPSERT; + new HoodieDeltaStreamer(downstreamCfg, jsc).sync(); + new HoodieDeltaStreamer(downstreamCfg, jsc).sync(); + + long baseTableRecords = sqlContext.read().format("org.apache.hudi").load(tableBasePath + "/*/*.parquet").count(); + long downStreamTableRecords = sqlContext.read().format("org.apache.hudi").load(downstreamTableBasePath + "/*/*.parquet").count(); + assertEquals(baseTableRecords, downStreamTableRecords); + } + + private void insertInTable(String tableBasePath, int count, WriteOperationType operationType) throws Exception { + HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, operationType, + Collections.singletonList(SqlQueryBasedTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE, false); + if (cfg.configs == null) { + cfg.configs = new ArrayList<>(); + } + cfg.configs.add("hoodie.cleaner.commits.retained=3"); + cfg.configs.add("hoodie.keep.min.commits=4"); + cfg.configs.add("hoodie.keep.max.commits=5"); + cfg.configs.add("hoodie.test.source.generate.inserts=true"); + + for (int i = 0; i < count; i++) { + new HoodieDeltaStreamer(cfg, jsc).sync(); + } + } + @Test public void testInsertOverwrite() throws Exception { testDeltaStreamerWithSpecifiedOperation(dfsBasePath + "/insert_overwrite", WriteOperationType.INSERT_OVERWRITE); @@ -1717,6 +1875,31 @@ public void testInsertOverwriteTable() throws Exception { testDeltaStreamerWithSpecifiedOperation(dfsBasePath + "/insert_overwrite_table", WriteOperationType.INSERT_OVERWRITE_TABLE); } + @Test + public void testDeletePartitions() throws Exception { + prepareParquetDFSSource(false, false, "source.avsc", "target.avsc", + PROPS_FILENAME_TEST_PARQUET, PARQUET_SOURCE_ROOT, false, "partition_path"); + String tableBasePath = dfsBasePath + "/test_parquet_table" + testNum; + HoodieDeltaStreamer deltaStreamer = new HoodieDeltaStreamer( + TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT, ParquetDFSSource.class.getName(), + null, PROPS_FILENAME_TEST_PARQUET, false, + false, 100000, false, null, 
null, "timestamp", null), jsc); + deltaStreamer.sync(); + TestHelpers.assertRecordCount(PARQUET_NUM_RECORDS, tableBasePath, sqlContext); + testNum++; + + prepareParquetDFSFiles(PARQUET_NUM_RECORDS, PARQUET_SOURCE_ROOT); + prepareParquetDFSSource(false, false); + // set write operation to DELETE_PARTITION and add transformer to filter only for records with partition HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION + deltaStreamer = new HoodieDeltaStreamer( + TestHelpers.makeConfig(tableBasePath, WriteOperationType.DELETE_PARTITION, ParquetDFSSource.class.getName(), + Collections.singletonList(TestSpecificPartitionTransformer.class.getName()), PROPS_FILENAME_TEST_PARQUET, false, + false, 100000, false, null, null, "timestamp", null), jsc); + deltaStreamer.sync(); + // No records should match the HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION. + TestHelpers.assertNoPartitionMatch(tableBasePath, sqlContext, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH); + } + void testDeltaStreamerWithSpecifiedOperation(final String tableBasePath, WriteOperationType operationType) throws Exception { // Initial insert HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.BULK_INSERT); @@ -1863,6 +2046,16 @@ public Dataset apply(JavaSparkContext jsc, SparkSession sparkSession, Datas } } + public static class TestSpecificPartitionTransformer implements Transformer { + + @Override + public Dataset apply(JavaSparkContext jsc, SparkSession sparkSession, Dataset rowDataset, + TypedProperties properties) { + Dataset toReturn = rowDataset.filter("partition_path == '" + HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH + "'"); + return toReturn; + } + } + /** * Add new field evoluted_optional_union_field with value of the field rider. 
*/ diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieDeltaStreamerWithMultiWriter.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieDeltaStreamerWithMultiWriter.java index 6a3831a960561..e383236af18a3 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieDeltaStreamerWithMultiWriter.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieDeltaStreamerWithMultiWriter.java @@ -38,6 +38,7 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Tag; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.EnumSource; @@ -48,15 +49,12 @@ import java.util.Collections; import java.util.ConcurrentModificationException; import java.util.List; -import java.util.Objects; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.Function; -import static org.apache.hudi.common.testutils.FixtureUtils.prepareFixtureTable; -import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; import static org.apache.hudi.config.HoodieWriteConfig.BULKINSERT_PARALLELISM_VALUE; import static org.apache.hudi.config.HoodieWriteConfig.BULK_INSERT_SORT_MODE; import static org.apache.hudi.config.HoodieWriteConfig.FINALIZE_WRITE_PARALLELISM_VALUE; @@ -68,31 +66,50 @@ import static org.apache.hudi.utilities.functional.HoodieDeltaStreamerTestBase.defaultSchemaProviderClassName; import static org.apache.hudi.utilities.functional.HoodieDeltaStreamerTestBase.prepareInitialConfigs; import static org.apache.hudi.utilities.functional.TestHoodieDeltaStreamer.deltaStreamerTestRunner; -import static org.apache.hudi.utilities.testutils.sources.AbstractBaseTestSource.DEFAULT_PARTITION_NUM; -import static org.apache.hudi.utilities.testutils.sources.AbstractBaseTestSource.dataGeneratorMap; -import static org.apache.hudi.utilities.testutils.sources.AbstractBaseTestSource.initDataGen; @Tag("functional") public class TestHoodieDeltaStreamerWithMultiWriter extends SparkClientFunctionalTestHarness { - private static final String COW_TEST_TABLE_NAME = "testtable_COPY_ON_WRITE"; private static final Logger LOG = LogManager.getLogger(TestHoodieDeltaStreamerWithMultiWriter.class); String basePath; String propsFilePath; String tableBasePath; - int totalRecords; @ParameterizedTest @EnumSource(HoodieTableType.class) void testUpsertsContinuousModeWithMultipleWritersForConflicts(HoodieTableType tableType) throws Exception { // NOTE : Overriding the LockProvider to InProcessLockProvider since Zookeeper locks work in unit test but fail on Jenkins with connection timeouts - setUpTestTable(tableType); + basePath = Paths.get(URI.create(basePath().replaceAll("/$", ""))).toString(); + propsFilePath = basePath + "/" + PROPS_FILENAME_TEST_MULTI_WRITER; + tableBasePath = basePath + "/testtable_" + tableType; prepareInitialConfigs(fs(), basePath, "foo"); TypedProperties props = prepareMultiWriterProps(fs(), basePath, propsFilePath); props.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.lock.InProcessLockProvider"); props.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY,"3000"); UtilitiesTestBase.Helpers.savePropsToDFS(props, fs(), propsFilePath); + // 
Keep it higher than batch-size to test continuous mode + int totalRecords = 3000; + + HoodieDeltaStreamer.Config prepJobConfig = getDeltaStreamerConfig(tableBasePath, tableType.name(), WriteOperationType.UPSERT, + propsFilePath, Collections.singletonList(TestHoodieDeltaStreamer.TripsWithDistanceTransformer.class.getName())); + prepJobConfig.continuousMode = true; + prepJobConfig.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords)); + prepJobConfig.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key())); + HoodieDeltaStreamer prepJob = new HoodieDeltaStreamer(prepJobConfig, jsc()); + + // Prepare base dataset with some commits + deltaStreamerTestRunner(prepJob, prepJobConfig, (r) -> { + if (tableType.equals(HoodieTableType.MERGE_ON_READ)) { + TestHoodieDeltaStreamer.TestHelpers.assertAtleastNDeltaCommits(3, tableBasePath, fs()); + TestHoodieDeltaStreamer.TestHelpers.assertAtleastNCompactionCommits(1, tableBasePath, fs()); + } else { + TestHoodieDeltaStreamer.TestHelpers.assertAtleastNCompactionCommits(3, tableBasePath, fs()); + } + TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext()); + TestHoodieDeltaStreamer.TestHelpers.assertDistanceCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext()); + return true; + }); HoodieDeltaStreamer.Config cfgIngestionJob = getDeltaStreamerConfig(tableBasePath, tableType.name(), WriteOperationType.UPSERT, propsFilePath, Collections.singletonList(TestHoodieDeltaStreamer.TripsWithDistanceTransformer.class.getName())); @@ -125,12 +142,36 @@ void testUpsertsContinuousModeWithMultipleWritersForConflicts(HoodieTableType ta @EnumSource(HoodieTableType.class) void testUpsertsContinuousModeWithMultipleWritersWithoutConflicts(HoodieTableType tableType) throws Exception { // NOTE : Overriding the LockProvider to InProcessLockProvider since Zookeeper locks work in unit test but fail on Jenkins with connection timeouts - setUpTestTable(tableType); + basePath = Paths.get(URI.create(basePath().replaceAll("/$", ""))).toString(); + propsFilePath = basePath + "/" + PROPS_FILENAME_TEST_MULTI_WRITER; + tableBasePath = basePath + "/testtable_" + tableType; prepareInitialConfigs(fs(), basePath, "foo"); TypedProperties props = prepareMultiWriterProps(fs(), basePath, propsFilePath); props.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.lock.InProcessLockProvider"); props.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY,"3000"); UtilitiesTestBase.Helpers.savePropsToDFS(props, fs(), propsFilePath); + // Keep it higher than batch-size to test continuous mode + int totalRecords = 3000; + + HoodieDeltaStreamer.Config prepJobConfig = getDeltaStreamerConfig(tableBasePath, tableType.name(), WriteOperationType.UPSERT, + propsFilePath, Collections.singletonList(TestHoodieDeltaStreamer.TripsWithDistanceTransformer.class.getName())); + prepJobConfig.continuousMode = true; + prepJobConfig.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords)); + prepJobConfig.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key())); + HoodieDeltaStreamer prepJob = new HoodieDeltaStreamer(prepJobConfig, jsc()); + + // Prepare base dataset with some commits + deltaStreamerTestRunner(prepJob, prepJobConfig, (r) -> { + if (tableType.equals(HoodieTableType.MERGE_ON_READ)) { + TestHoodieDeltaStreamer.TestHelpers.assertAtleastNDeltaCommits(3, tableBasePath, fs()); + 
TestHoodieDeltaStreamer.TestHelpers.assertAtleastNCompactionCommits(1, tableBasePath, fs()); + } else { + TestHoodieDeltaStreamer.TestHelpers.assertAtleastNCompactionCommits(3, tableBasePath, fs()); + } + TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext()); + TestHoodieDeltaStreamer.TestHelpers.assertDistanceCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext()); + return true; + }); // create new ingestion & backfill job config to generate only INSERTS to avoid conflict props = prepareMultiWriterProps(fs(), basePath, propsFilePath); @@ -164,26 +205,41 @@ void testUpsertsContinuousModeWithMultipleWritersWithoutConflicts(HoodieTableTyp cfgIngestionJob2, backfillJob2, cfgBackfillJob2, false, "batch2"); } + @Disabled @ParameterizedTest @EnumSource(value = HoodieTableType.class, names = {"COPY_ON_WRITE"}) - public void testLatestCheckpointCarryOverWithMultipleWriters(HoodieTableType tableType) throws Exception { - testCheckpointCarryOver(tableType); - } - - private void testCheckpointCarryOver(HoodieTableType tableType) throws Exception { + void testLatestCheckpointCarryOverWithMultipleWriters(HoodieTableType tableType) throws Exception { // NOTE : Overriding the LockProvider to InProcessLockProvider since Zookeeper locks work in unit test but fail on Jenkins with connection timeouts - setUpTestTable(tableType); + basePath = Paths.get(URI.create(basePath().replaceAll("/$", ""))).toString(); + propsFilePath = basePath + "/" + PROPS_FILENAME_TEST_MULTI_WRITER; + tableBasePath = basePath + "/testtable_" + tableType; prepareInitialConfigs(fs(), basePath, "foo"); TypedProperties props = prepareMultiWriterProps(fs(), basePath, propsFilePath); props.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.lock.InProcessLockProvider"); props.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY,"3000"); UtilitiesTestBase.Helpers.savePropsToDFS(props, fs(), propsFilePath); + // Keep it higher than batch-size to test continuous mode + int totalRecords = 3000; - HoodieDeltaStreamer.Config cfgIngestionJob = getDeltaStreamerConfig(tableBasePath, tableType.name(), WriteOperationType.UPSERT, + HoodieDeltaStreamer.Config prepJobConfig = getDeltaStreamerConfig(tableBasePath, tableType.name(), WriteOperationType.UPSERT, propsFilePath, Collections.singletonList(TestHoodieDeltaStreamer.TripsWithDistanceTransformer.class.getName())); - cfgIngestionJob.continuousMode = true; - cfgIngestionJob.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords)); - cfgIngestionJob.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key())); + prepJobConfig.continuousMode = true; + prepJobConfig.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords)); + prepJobConfig.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key())); + HoodieDeltaStreamer prepJob = new HoodieDeltaStreamer(prepJobConfig, jsc()); + + // Prepare base dataset with some commits + deltaStreamerTestRunner(prepJob, prepJobConfig, (r) -> { + if (tableType.equals(HoodieTableType.MERGE_ON_READ)) { + TestHoodieDeltaStreamer.TestHelpers.assertAtleastNDeltaCommits(3, tableBasePath, fs()); + TestHoodieDeltaStreamer.TestHelpers.assertAtleastNCompactionCommits(1, tableBasePath, fs()); + } else { + TestHoodieDeltaStreamer.TestHelpers.assertAtleastNCompactionCommits(3, tableBasePath, fs()); + } + TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(totalRecords, 
tableBasePath + "/*/*.parquet", sqlContext()); + TestHoodieDeltaStreamer.TestHelpers.assertDistanceCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext()); + return true; + }); // create a backfill job with checkpoint from the first instant HoodieDeltaStreamer.Config cfgBackfillJob = getDeltaStreamerConfig(tableBasePath, tableType.name(), WriteOperationType.UPSERT, @@ -292,26 +348,6 @@ private static HoodieDeltaStreamer.Config getDeltaStreamerConfig(String basePath return cfg; } - /** - * Specifically used for {@link TestHoodieDeltaStreamerWithMultiWriter}. - * - * The fixture test tables have random records generated by - * {@link org.apache.hudi.common.testutils.HoodieTestDataGenerator} using - * {@link org.apache.hudi.common.testutils.HoodieTestDataGenerator#TRIP_EXAMPLE_SCHEMA}. - * - * The COW fixture test table has 3000 unique records in 7 commits. - * The MOR fixture test table has 3000 unique records in 9 deltacommits and 1 compaction commit. - */ - private void setUpTestTable(HoodieTableType tableType) throws IOException { - basePath = Paths.get(URI.create(basePath().replaceAll("/$", ""))).toString(); - propsFilePath = basePath + "/" + PROPS_FILENAME_TEST_MULTI_WRITER; - String fixtureName = String.format("fixtures/testUpsertsContinuousModeWithMultipleWriters.%s.zip", tableType.name()); - tableBasePath = prepareFixtureTable(Objects.requireNonNull(getClass() - .getClassLoader().getResource(fixtureName)), Paths.get(basePath)).toString(); - initDataGen(sqlContext(), tableBasePath + "/*/*.parquet", DEFAULT_PARTITION_NUM); - totalRecords = dataGeneratorMap.get(DEFAULT_PARTITION_NUM).getNumExistingKeys(TRIP_EXAMPLE_SCHEMA); - } - private void runJobsInParallel(String tableBasePath, HoodieTableType tableType, int totalRecords, HoodieDeltaStreamer ingestionJob, HoodieDeltaStreamer.Config cfgIngestionJob, HoodieDeltaStreamer backfillJob, HoodieDeltaStreamer.Config cfgBackfillJob, boolean expectConflict, String jobId) throws Exception { @@ -331,22 +367,22 @@ private void runJobsInParallel(String tableBasePath, HoodieTableType tableType, return true; }; - AtomicBoolean continousFailed = new AtomicBoolean(false); + AtomicBoolean continuousFailed = new AtomicBoolean(false); AtomicBoolean backfillFailed = new AtomicBoolean(false); try { Future regularIngestionJobFuture = service.submit(() -> { try { deltaStreamerTestRunner(ingestionJob, cfgIngestionJob, conditionForRegularIngestion, jobId); } catch (Throwable ex) { - continousFailed.set(true); + continuousFailed.set(true); LOG.error("Continuous job failed " + ex.getMessage()); throw new RuntimeException(ex); } }); Future backfillJobFuture = service.submit(() -> { try { - // trigger backfill atleast after 1 requested entry is added to timline from continuous job. If not, there is a chance that backfill will complete even before - // continous job starts. + // trigger backfill atleast after 1 requested entry is added to timeline from continuous job. If not, there is a chance that backfill will complete even before + // continuous job starts. 
awaitCondition(new GetCommitsAfterInstant(tableBasePath, lastSuccessfulCommit)); backfillJob.sync(); } catch (Throwable ex) { diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieMultiTableDeltaStreamer.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieMultiTableDeltaStreamer.java index 8eb91d24687c1..da5c6cc66a2ff 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieMultiTableDeltaStreamer.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieMultiTableDeltaStreamer.java @@ -25,6 +25,7 @@ import org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer; import org.apache.hudi.utilities.deltastreamer.TableExecutionContext; import org.apache.hudi.utilities.schema.FilebasedSchemaProvider; +import org.apache.hudi.utilities.schema.SchemaRegistryProvider; import org.apache.hudi.utilities.sources.JsonKafkaSource; import org.apache.hudi.utilities.sources.ParquetDFSSource; import org.apache.hudi.utilities.sources.TestDataSource; @@ -49,12 +50,13 @@ public class TestHoodieMultiTableDeltaStreamer extends HoodieDeltaStreamerTestBa static class TestHelpers { - static HoodieMultiTableDeltaStreamer.Config getConfig(String fileName, String configFolder, String sourceClassName, boolean enableHiveSync, boolean enableMetaSync) { - return getConfig(fileName, configFolder, sourceClassName, enableHiveSync, enableMetaSync, true, "multi_table_dataset"); + static HoodieMultiTableDeltaStreamer.Config getConfig(String fileName, String configFolder, String sourceClassName, boolean enableHiveSync, boolean enableMetaSync, + Class clazz) { + return getConfig(fileName, configFolder, sourceClassName, enableHiveSync, enableMetaSync, true, "multi_table_dataset", clazz); } static HoodieMultiTableDeltaStreamer.Config getConfig(String fileName, String configFolder, String sourceClassName, boolean enableHiveSync, boolean enableMetaSync, - boolean setSchemaProvider, String basePathPrefix) { + boolean setSchemaProvider, String basePathPrefix, Class clazz) { HoodieMultiTableDeltaStreamer.Config config = new HoodieMultiTableDeltaStreamer.Config(); config.configFolder = configFolder; config.targetTableName = "dummy_table"; @@ -64,7 +66,7 @@ static HoodieMultiTableDeltaStreamer.Config getConfig(String fileName, String co config.sourceClassName = sourceClassName; config.sourceOrderingField = "timestamp"; if (setSchemaProvider) { - config.schemaProviderClassName = FilebasedSchemaProvider.class.getName(); + config.schemaProviderClassName = clazz != null ? 
clazz.getName() : FilebasedSchemaProvider.class.getName(); } config.enableHiveSync = enableHiveSync; config.enableMetaSync = enableMetaSync; @@ -74,7 +76,7 @@ static HoodieMultiTableDeltaStreamer.Config getConfig(String fileName, String co @Test public void testInvalidHiveSyncProps() throws IOException { - HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_INVALID_HIVE_SYNC_TEST_SOURCE1, dfsBasePath + "/config", TestDataSource.class.getName(), true, true); + HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_INVALID_HIVE_SYNC_TEST_SOURCE1, dfsBasePath + "/config", TestDataSource.class.getName(), true, true, null); Exception e = assertThrows(HoodieException.class, () -> { new HoodieMultiTableDeltaStreamer(cfg, jsc); }, "Should fail when hive sync table not provided with enableHiveSync flag"); @@ -84,7 +86,7 @@ public void testInvalidHiveSyncProps() throws IOException { @Test public void testInvalidPropsFilePath() throws IOException { - HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_INVALID_FILE, dfsBasePath + "/config", TestDataSource.class.getName(), true, true); + HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_INVALID_FILE, dfsBasePath + "/config", TestDataSource.class.getName(), true, true, null); Exception e = assertThrows(IllegalArgumentException.class, () -> { new HoodieMultiTableDeltaStreamer(cfg, jsc); }, "Should fail when invalid props file is provided"); @@ -94,7 +96,7 @@ public void testInvalidPropsFilePath() throws IOException { @Test public void testInvalidTableConfigFilePath() throws IOException { - HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_INVALID_TABLE_CONFIG_FILE, dfsBasePath + "/config", TestDataSource.class.getName(), true, true); + HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_INVALID_TABLE_CONFIG_FILE, dfsBasePath + "/config", TestDataSource.class.getName(), true, true, null); Exception e = assertThrows(IllegalArgumentException.class, () -> { new HoodieMultiTableDeltaStreamer(cfg, jsc); }, "Should fail when invalid table config props file path is provided"); @@ -104,7 +106,7 @@ public void testInvalidTableConfigFilePath() throws IOException { @Test public void testCustomConfigProps() throws IOException { - HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_FILENAME_TEST_SOURCE1, dfsBasePath + "/config", TestDataSource.class.getName(), false, false); + HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_FILENAME_TEST_SOURCE1, dfsBasePath + "/config", TestDataSource.class.getName(), false, false, SchemaRegistryProvider.class); HoodieMultiTableDeltaStreamer streamer = new HoodieMultiTableDeltaStreamer(cfg, jsc); TableExecutionContext executionContext = streamer.getTableExecutionContexts().get(1); assertEquals(2, streamer.getTableExecutionContexts().size()); @@ -114,13 +116,16 @@ public void testCustomConfigProps() throws IOException { assertEquals("_row_key", executionContext.getProperties().getString(DataSourceWriteOptions.RECORDKEY_FIELD().key())); assertEquals(TestHoodieDeltaStreamer.TestGenerator.class.getName(), executionContext.getProperties().getString(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key())); assertEquals("uber_hive_dummy_table", executionContext.getProperties().getString(HoodieMultiTableDeltaStreamer.Constants.HIVE_SYNC_TABLE_PROP)); + assertEquals("http://localhost:8081/subjects/random-value/versions/latest", 
executionContext.getProperties().getString(SchemaRegistryProvider.Config.SRC_SCHEMA_REGISTRY_URL_PROP)); + assertEquals("http://localhost:8081/subjects/topic2-value/versions/latest", + streamer.getTableExecutionContexts().get(0).getProperties().getString(SchemaRegistryProvider.Config.SRC_SCHEMA_REGISTRY_URL_PROP)); } @Test @Disabled public void testInvalidIngestionProps() { Exception e = assertThrows(Exception.class, () -> { - HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_FILENAME_TEST_SOURCE1, dfsBasePath + "/config", TestDataSource.class.getName(), true, true); + HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_FILENAME_TEST_SOURCE1, dfsBasePath + "/config", TestDataSource.class.getName(), true, true, null); new HoodieMultiTableDeltaStreamer(cfg, jsc); }, "Creation of execution object should fail without kafka topic"); log.debug("Creation of execution object failed with error: " + e.getMessage(), e); @@ -139,7 +144,7 @@ public void testMultiTableExecutionWithKafkaSource() throws IOException { testUtils.sendMessages(topicName1, Helpers.jsonifyRecords(dataGenerator.generateInsertsAsPerSchema("000", 5, HoodieTestDataGenerator.TRIP_SCHEMA))); testUtils.sendMessages(topicName2, Helpers.jsonifyRecords(dataGenerator.generateInsertsAsPerSchema("000", 10, HoodieTestDataGenerator.SHORT_TRIP_SCHEMA))); - HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_FILENAME_TEST_SOURCE1, dfsBasePath + "/config", JsonKafkaSource.class.getName(), false, false); + HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_FILENAME_TEST_SOURCE1, dfsBasePath + "/config", JsonKafkaSource.class.getName(), false, false, null); HoodieMultiTableDeltaStreamer streamer = new HoodieMultiTableDeltaStreamer(cfg, jsc); List executionContexts = streamer.getTableExecutionContexts(); TypedProperties properties = executionContexts.get(1).getProperties(); @@ -189,7 +194,7 @@ public void testMultiTableExecutionWithParquetSource() throws IOException { String parquetPropsFile = populateCommonPropsAndWriteToFile(); HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(parquetPropsFile, dfsBasePath + "/config", ParquetDFSSource.class.getName(), false, false, - false, "multi_table_parquet"); + false, "multi_table_parquet", null); HoodieMultiTableDeltaStreamer streamer = new HoodieMultiTableDeltaStreamer(cfg, jsc); List executionContexts = streamer.getTableExecutionContexts(); @@ -219,7 +224,7 @@ public void testMultiTableExecutionWithParquetSource() throws IOException { @Test public void testTableLevelProperties() throws IOException { - HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_FILENAME_TEST_SOURCE1, dfsBasePath + "/config", TestDataSource.class.getName(), false, false); + HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_FILENAME_TEST_SOURCE1, dfsBasePath + "/config", TestDataSource.class.getName(), false, false, null); HoodieMultiTableDeltaStreamer streamer = new HoodieMultiTableDeltaStreamer(cfg, jsc); List tableExecutionContexts = streamer.getTableExecutionContexts(); tableExecutionContexts.forEach(tableExecutionContext -> { diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieSnapshotCopier.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieSnapshotCopier.java index f192ede73a159..dd25e7f8bebad 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieSnapshotCopier.java +++ 
b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieSnapshotCopier.java @@ -96,7 +96,7 @@ public void testSnapshotCopy() throws Exception { new File(basePath + "/2016/05/01/").mkdirs(); new File(basePath + "/2016/05/02/").mkdirs(); new File(basePath + "/2016/05/06/").mkdirs(); - HoodieTestDataGenerator.writePartitionMetadata(fs, new String[] {"2016/05/01", "2016/05/02", "2016/05/06"}, + HoodieTestDataGenerator.writePartitionMetadataDeprecated(fs, new String[] {"2016/05/01", "2016/05/02", "2016/05/06"}, basePath); // Make commit1 File file11 = new File(basePath + "/2016/05/01/" + FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, "id11")); diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestHoodieIncrSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestHoodieIncrSource.java index 250e288294aca..1f15cc3093e7a 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestHoodieIncrSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestHoodieIncrSource.java @@ -20,12 +20,14 @@ import org.apache.hudi.client.SparkRDDWriteClient; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.testutils.HoodieClientTestHarness; import org.apache.hudi.utilities.schema.SchemaProvider; @@ -46,6 +48,7 @@ import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; public class TestHoodieIncrSource extends HoodieClientTestHarness { @@ -61,21 +64,42 @@ public void tearDown() throws IOException { @Test public void testHoodieIncrSource() throws IOException { - HoodieWriteConfig writeConfig = getConfigBuilder(basePath).build(); + HoodieWriteConfig writeConfig = getConfigBuilder(basePath) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .archiveCommitsWith(2, 3).retainCommits(1).build()) + .withMetadataConfig(HoodieMetadataConfig.newBuilder() + .withMaxNumDeltaCommitsBeforeCompaction(1).build()) + .build(); SparkRDDWriteClient writeClient = new SparkRDDWriteClient(context, writeConfig); - Pair> inserts = writeRecords(writeClient, true, null); - Pair> inserts2 = writeRecords(writeClient, true, null); - Pair> inserts3 = writeRecords(writeClient, true, null); + Pair> inserts = writeRecords(writeClient, true, null, "100"); + Pair> inserts2 = writeRecords(writeClient, true, null, "200"); + Pair> inserts3 = writeRecords(writeClient, true, null, "300"); + Pair> inserts4 = writeRecords(writeClient, true, null, "400"); + Pair> inserts5 = writeRecords(writeClient, true, null, "500"); // read everything upto latest - readAndAssert(IncrSourceHelper.MissingCheckpointStrategy.READ_UPTO_LATEST_COMMIT, 300, inserts3.getKey()); + readAndAssert(IncrSourceHelper.MissingCheckpointStrategy.READ_UPTO_LATEST_COMMIT, Option.empty(), 500, inserts5.getKey()); + + // even if the begin timestamp is archived (100), full table scan should kick in, but should 
filter for records having commit time > 100 + readAndAssert(IncrSourceHelper.MissingCheckpointStrategy.READ_UPTO_LATEST_COMMIT, Option.of("100"), 400, inserts5.getKey()); + + // even if READ_UPTO_LATEST_COMMIT is set, if the begin timestamp is in the active timeline, only an incremental read should kick in. + readAndAssert(IncrSourceHelper.MissingCheckpointStrategy.READ_UPTO_LATEST_COMMIT, Option.of("400"), 100, inserts5.getKey()); // read just the latest - readAndAssert(IncrSourceHelper.MissingCheckpointStrategy.READ_LATEST, 100, inserts3.getKey()); + readAndAssert(IncrSourceHelper.MissingCheckpointStrategy.READ_LATEST, Option.empty(), 100, inserts5.getKey()); + + // ensure checkpoint does not move + readAndAssert(IncrSourceHelper.MissingCheckpointStrategy.READ_LATEST, Option.of(inserts5.getKey()), 0, inserts5.getKey()); + + Pair> inserts6 = writeRecords(writeClient, true, null, "600"); + + // insert new batch and ensure the checkpoint moves + readAndAssert(IncrSourceHelper.MissingCheckpointStrategy.READ_LATEST, Option.of(inserts5.getKey()), 100, inserts6.getKey()); } - private void readAndAssert(IncrSourceHelper.MissingCheckpointStrategy missingCheckpointStrategy, int expectedCount, String expectedCheckpoint) { + private void readAndAssert(IncrSourceHelper.MissingCheckpointStrategy missingCheckpointStrategy, Option checkpointToPull, int expectedCount, String expectedCheckpoint) { Properties properties = new Properties(); properties.setProperty("hoodie.deltastreamer.source.hoodieincr.path", basePath); @@ -84,14 +108,18 @@ private void readAndAssert(IncrSourceHelper.MissingCheckpointStrategy missingChe HoodieIncrSource incrSource = new HoodieIncrSource(typedProperties, jsc, sparkSession, new TestSchemaProvider(HoodieTestDataGenerator.AVRO_SCHEMA)); // read everything until latest - Pair>, String> batchCheckPoint = incrSource.fetchNextBatch(Option.empty(), 500); + Pair>, String> batchCheckPoint = incrSource.fetchNextBatch(checkpointToPull, 500); Assertions.assertNotNull(batchCheckPoint.getValue()); - assertEquals(batchCheckPoint.getKey().get().count(), expectedCount); + if (expectedCount == 0) { + assertFalse(batchCheckPoint.getKey().isPresent()); + } else { + assertEquals(batchCheckPoint.getKey().get().count(), expectedCount); + } Assertions.assertEquals(batchCheckPoint.getRight(), expectedCheckpoint); } - public Pair> writeRecords(SparkRDDWriteClient writeClient, boolean insert, List insertRecords) throws IOException { - String commit = writeClient.startCommit(); + public Pair> writeRecords(SparkRDDWriteClient writeClient, boolean insert, List insertRecords, String commit) throws IOException { + writeClient.startCommitWithTime(commit); List records = insert ?
dataGen.generateInserts(commit, 100) : dataGen.generateUpdates(commit, insertRecords); JavaRDD result = writeClient.upsert(jsc.parallelize(records, 1), commit); List statuses = result.collect(); diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestSqlSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestSqlSource.java index 9c3d5584a5dd7..e4ca51842e87e 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestSqlSource.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/TestSqlSource.java @@ -40,6 +40,7 @@ import java.io.IOException; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertThrows; /** @@ -135,6 +136,23 @@ public void testSqlSourceRowFormat() throws IOException { assertEquals(10000, fetch1AsRows.getBatch().get().count()); } + /** + * Runs the test scenario of reading data from the source in row format. + * Source has no records. + * + * @throws IOException + */ + @Test + public void testSqlSourceCheckpoint() throws IOException { + props.setProperty(sqlSourceConfig, "select * from test_sql_table where 1=0"); + sqlSource = new SqlSource(props, jsc, sparkSession, schemaProvider); + sourceFormatAdapter = new SourceFormatAdapter(sqlSource); + + InputBatch> fetch1AsRows = + sourceFormatAdapter.fetchNewDataInRowFormat(Option.empty(), Long.MAX_VALUE); + assertNull(fetch1AsRows.getCheckpointForNextBatch()); + } + /** * Runs the test scenario of reading data from the source in row format. * Source has more records than source limit. diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/JdbcTestUtils.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/JdbcTestUtils.java index 377063eb045e9..79173dbdc8a0c 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/JdbcTestUtils.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/JdbcTestUtils.java @@ -20,6 +20,7 @@ package org.apache.hudi.utilities.testutils; import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; @@ -81,7 +82,8 @@ public static List insert(String commitTime, int numRecords, Conne .stream() .map(r -> { try { - return ((GenericRecord) r.getData().getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA, props).get()); + return ((GenericRecord) ((HoodieAvroRecord) r).getData() + .getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA, props).get()); } catch (IOException e) { return null; } @@ -125,7 +127,7 @@ public static List update(String commitTime, List in List updateRecords = dataGenerator.generateUpdates(commitTime, inserts); updateRecords.stream().map(m -> { try { - return m.getData().getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA, props).get(); + return ((HoodieAvroRecord) m).getData().getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA, props).get(); } catch (IOException e) { return null; } diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/UtilitiesTestBase.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/UtilitiesTestBase.java index 90a3f5af38021..8464740bf2bf0 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/UtilitiesTestBase.java +++ 
b/hudi-utilities/src/test/java/org/apache/hudi/utilities/testutils/UtilitiesTestBase.java @@ -20,6 +20,7 @@ import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieAvroRecord; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieTableType; import org.apache.hudi.common.table.HoodieTableMetaClient; @@ -378,7 +379,7 @@ public static TypedProperties setupSchemaOnDFSWithAbsoluteScope(String scope, St public static GenericRecord toGenericRecord(HoodieRecord hoodieRecord, Schema schema) { try { - Option recordOpt = hoodieRecord.getData().getInsertValue(schema); + Option recordOpt = ((HoodieAvroRecord) hoodieRecord).getData().getInsertValue(schema); return (GenericRecord) recordOpt.get(); } catch (IOException e) { return null; diff --git a/hudi-utilities/src/test/resources/delta-streamer-config/short_trip_uber_config.properties b/hudi-utilities/src/test/resources/delta-streamer-config/short_trip_uber_config.properties index 243afc90f3742..75d74d6bc8932 100644 --- a/hudi-utilities/src/test/resources/delta-streamer-config/short_trip_uber_config.properties +++ b/hudi-utilities/src/test/resources/delta-streamer-config/short_trip_uber_config.properties @@ -22,4 +22,6 @@ hoodie.deltastreamer.source.kafka.topic=topic2 hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP hoodie.deltastreamer.keygen.timebased.input.dateformat=yyyy-MM-dd HH:mm:ss.S hoodie.datasource.hive_sync.table=short_trip_uber_hive_dummy_table -hoodie.datasource.write.keygenerator.class=org.apache.hudi.utilities.functional.TestHoodieDeltaStreamer$TestTableLevelGenerator \ No newline at end of file +hoodie.datasource.write.keygenerator.class=org.apache.hudi.utilities.functional.TestHoodieDeltaStreamer$TestTableLevelGenerator +hoodie.deltastreamer.schemaprovider.registry.baseUrl=http://localhost:8081/subjects/ +hoodie.deltastreamer.schemaprovider.registry.urlSuffix=-value/versions/latest \ No newline at end of file diff --git a/hudi-utilities/src/test/resources/delta-streamer-config/uber_config.properties b/hudi-utilities/src/test/resources/delta-streamer-config/uber_config.properties index 3d3501fec73d3..f5b079265d438 100644 --- a/hudi-utilities/src/test/resources/delta-streamer-config/uber_config.properties +++ b/hudi-utilities/src/test/resources/delta-streamer-config/uber_config.properties @@ -22,4 +22,6 @@ hoodie.deltastreamer.source.kafka.topic=topic1 hoodie.deltastreamer.keygen.timebased.timestamp.type=UNIX_TIMESTAMP hoodie.deltastreamer.keygen.timebased.input.dateformat=yyyy-MM-dd HH:mm:ss.S hoodie.datasource.hive_sync.database=uber_hive_db -hoodie.datasource.hive_sync.table=uber_hive_dummy_table \ No newline at end of file +hoodie.datasource.hive_sync.table=uber_hive_dummy_table +hoodie.deltastreamer.schemaprovider.registry.url=http://localhost:8081/subjects/random-value/versions/latest +hoodie.deltastreamer.schemaprovider.registry.targetUrl=http://localhost:8081/subjects/random-value/versions/latest \ No newline at end of file diff --git a/hudi-utilities/src/test/resources/fixtures/testUpsertsContinuousModeWithMultipleWriters.COPY_ON_WRITE.zip b/hudi-utilities/src/test/resources/fixtures/testUpsertsContinuousModeWithMultipleWriters.COPY_ON_WRITE.zip deleted file mode 100644 index 299b070bee34a..0000000000000 Binary files a/hudi-utilities/src/test/resources/fixtures/testUpsertsContinuousModeWithMultipleWriters.COPY_ON_WRITE.zip and /dev/null differ diff --git 
a/hudi-utilities/src/test/resources/fixtures/testUpsertsContinuousModeWithMultipleWriters.MERGE_ON_READ.zip b/hudi-utilities/src/test/resources/fixtures/testUpsertsContinuousModeWithMultipleWriters.MERGE_ON_READ.zip deleted file mode 100644 index d80439d20d3df..0000000000000 Binary files a/hudi-utilities/src/test/resources/fixtures/testUpsertsContinuousModeWithMultipleWriters.MERGE_ON_READ.zip and /dev/null differ diff --git a/packaging/hudi-flink-bundle/pom.xml b/packaging/hudi-flink-bundle/pom.xml index 066cefb1ec2b3..222478090b4b0 100644 --- a/packaging/hudi-flink-bundle/pom.xml +++ b/packaging/hudi-flink-bundle/pom.xml @@ -136,7 +136,8 @@ org.apache.hive:hive-common org.apache.hive:hive-service org.apache.hive:hive-service-rpc - org.apache.hive:hive-exec + org.apache.hive:hive-exec + org.apache.hive:hive-standalone-metastore org.apache.hive:hive-metastore org.apache.hive:hive-jdbc org.datanucleus:datanucleus-core @@ -161,10 +162,6 @@ org.apache.avro. ${flink.bundle.shade.prefix}org.apache.avro. - - org.apache.parquet. - ${flink.bundle.shade.prefix}org.apache.parquet. - com.yammer.metrics. ${flink.bundle.shade.prefix}com.yammer.metrics. @@ -173,46 +170,6 @@ com.beust.jcommander. ${flink.bundle.shade.prefix}com.beust.jcommander. - - org.apache.hive.jdbc. - ${flink.bundle.shade.prefix}org.apache.hive.jdbc. - - - org.apache.hadoop.hive.metastore. - ${flink.bundle.shade.prefix}org.apache.hadoop.hive.metastore. - - - org.apache.hive.common. - ${flink.bundle.shade.prefix}org.apache.hive.common. - - - org.apache.hadoop.hive.common. - ${flink.bundle.shade.prefix}org.apache.hadoop.hive.common. - - - org.apache.hadoop.hive.conf. - ${flink.bundle.shade.prefix}org.apache.hadoop.hive.conf. - - - org.apache.hive.service. - ${flink.bundle.shade.prefix}org.apache.hive.service. - - - org.apache.hadoop.hive.service. - ${flink.bundle.shade.prefix}org.apache.hadoop.hive.service. - - - org.apache.hadoop.hive.ql.metadata. - ${flink.bundle.shade.prefix}org.apache.hadoop.hive.ql.metadata. - - - org.apache.hadoop.hive.ql.optimizer. - ${flink.bundle.shade.prefix}org.apache.hadoop.hive.ql.optimizer. - - - org.apache.hadoop.hive.ql.lockmgr. - ${flink.bundle.shade.prefix}org.apache.hadoop.hive.ql.lockmgr. - com.codahale.metrics. ${flink.bundle.shade.prefix}com.codahale.metrics. 
@@ -687,6 +644,12 @@ ${hive.version} ${flink.bundle.hive.scope} + + ${hive.groupid} + hive-standalone-metastore + ${hive.version} + ${flink.bundle.hive.scope} + diff --git a/packaging/hudi-hadoop-mr-bundle/pom.xml b/packaging/hudi-hadoop-mr-bundle/pom.xml index 23399233e670a..f6215b1e017a5 100644 --- a/packaging/hudi-hadoop-mr-bundle/pom.xml +++ b/packaging/hudi-hadoop-mr-bundle/pom.xml @@ -67,9 +67,6 @@ org.apache.hudi:hudi-common org.apache.hudi:hudi-hadoop-mr - - org.scala-lang:scala-library - org.apache.parquet:parquet-avro org.apache.avro:avro com.esotericsoftware:kryo-shaded @@ -155,14 +152,6 @@ ${project.version} - - - - org.scala-lang - scala-library - ${scala.version} - - org.apache.parquet diff --git a/packaging/hudi-hive-sync-bundle/pom.xml b/packaging/hudi-hive-sync-bundle/pom.xml index 9b775e76c7b48..75fce574eb3d6 100644 --- a/packaging/hudi-hive-sync-bundle/pom.xml +++ b/packaging/hudi-hive-sync-bundle/pom.xml @@ -69,9 +69,6 @@ org.apache.hudi:hudi-sync-common org.apache.hudi:hudi-hive-sync - - org.scala-lang:scala-library - com.beust:jcommander org.apache.avro:avro org.apache.parquet:parquet-avro @@ -134,14 +131,6 @@ ${project.version} - - - - org.scala-lang - scala-library - ${scala.version} - - org.apache.parquet diff --git a/packaging/hudi-presto-bundle/pom.xml b/packaging/hudi-presto-bundle/pom.xml index f085c30b48d57..90c1087dcb4d2 100644 --- a/packaging/hudi-presto-bundle/pom.xml +++ b/packaging/hudi-presto-bundle/pom.xml @@ -67,9 +67,6 @@ org.apache.hudi:hudi-common org.apache.hudi:hudi-hadoop-mr - - org.scala-lang:scala-library - org.apache.parquet:parquet-avro org.apache.avro:avro org.codehaus.jackson:* @@ -190,14 +187,6 @@ - - - - org.scala-lang - scala-library - ${scala.version} - - org.apache.parquet diff --git a/packaging/hudi-trino-bundle/pom.xml b/packaging/hudi-trino-bundle/pom.xml index a7f41ecaf177a..adf73f1bb0b83 100644 --- a/packaging/hudi-trino-bundle/pom.xml +++ b/packaging/hudi-trino-bundle/pom.xml @@ -68,9 +68,6 @@ org.apache.hudi:hudi-common org.apache.hudi:hudi-hadoop-mr - - org.scala-lang:scala-library - org.apache.parquet:parquet-avro org.apache.avro:avro org.codehaus.jackson:* @@ -189,14 +186,6 @@ - - - - org.scala-lang - scala-library - ${scala.version} - - org.apache.hbase diff --git a/pom.xml b/pom.xml index 2778885e8312e..1b28ae1bb9a49 100644 --- a/pom.xml +++ b/pom.xml @@ -117,9 +117,9 @@ 4.4.1 ${spark2.version} - 1.13.1 + 1.14.3 2.4.4 - 3.2.0 + 3.2.1 hudi-spark2 hudi-spark2-common 1.8.2 @@ -164,7 +164,7 @@ 4.7 1.12.22 3.17.3 - 3.1.0 + 3.11.4 1.1.0 8000 http://localhost:${dynamodb-local.port} @@ -349,6 +349,7 @@ 3 @{argLine} + false ${surefire-log4j.file} @@ -1119,6 +1120,12 @@ awaitility ${awaitility.version} test + + + org.objenesis + objenesis + + @@ -1586,7 +1593,9 @@ hudi-spark3-common 3.1.0 2.4.1 - 1.12.1 + 1.12.2 + 1.10.2 + 1.6.12 ${fasterxml.spark3.version} ${fasterxml.spark3.version} ${fasterxml.spark3.version} diff --git a/rfc/README.md b/rfc/README.md index 63b81a884fc2c..a9587d1d79cf3 100644 --- a/rfc/README.md +++ b/rfc/README.md @@ -60,12 +60,13 @@ The list of all RFCs can be found here. 
| 34 | [Hudi BigQuery Integration (WIP)](https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=188745980) | `UNDER REVIEW` |
| 35 | [Make Flink MOR table writing streaming friendly](https://cwiki.apache.org/confluence/display/HUDI/RFC-35%3A+Make+Flink+MOR+table+writing+streaming+friendly) | `UNDER REVIEW` |
| 36 | [HUDI Metastore Server](https://cwiki.apache.org/confluence/display/HUDI/%5BWIP%5D+RFC-36%3A+HUDI+Metastore+Server) | `UNDER REVIEW` |
-| 37 | [Hudi metadata based bloom index] | `UNDER REVIEW` |
-| 38 | [Spark Datasource V2 Integration] | `UNDER REVIEW` |
+| 37 | [Hudi Metadata based Bloom Index](./rfc-37/rfc-37.md) | `IN PROGRESS` |
+| 38 | [Spark Datasource V2 Integration](./rfc-38/rfc-38.md) | `IN PROGRESS` |
| 39 | [Incremental source for Debezium](./rfc-39/rfc-39.md) | `IN PROGRESS` |
-| 40 | [Hudi Connector for Trino] | `UNDER REVIEW` |
+| 40 | [Hudi Connector for Trino](./rfc-40/rfc-40.md) | `IN PROGRESS` |
| 41 | [Hudi Snowflake Integration] | `UNDER REVIEW` |
| 42 | [Consistent Hashing Index](./rfc-42/rfc-42.md) | `UNDER REVIEW` |
| 43 | [Compaction / Clustering Service](./rfc-43/rfc-43.md) | `UNDER REVIEW` |
| 44 | [Hudi Connector for Presto](./rfc-44/rfc-44.md) | `UNDER REVIEW` |
| 45 | [Asynchronous Metadata Indexing](./rfc-45/rfc-45.md) | `UNDER REVIEW` |
+| 46 | [Optimizing Record Payload Handling](./rfc-46/rfc-46.md) | `UNDER REVIEW` |
diff --git a/rfc/rfc-27/col_stats.png b/rfc/rfc-27/col_stats.png new file mode 100644 index 0000000000000..76aa6ab44e7c2 Binary files /dev/null and b/rfc/rfc-27/col_stats.png differ
diff --git a/rfc/rfc-27/rfc-27.md b/rfc/rfc-27/rfc-27.md new file mode 100644 index 0000000000000..3b00af7c140a0 --- /dev/null +++ b/rfc/rfc-27/rfc-27.md @@ -0,0 +1,444 @@

# RFC-[27]: [Data skipping Index to improve query performance]

## Proposers

- @manojpec
- @shivnarayan
- @satish.kotha

## Approvers
- @rmpifer
- @uditme

## Status

JIRA: https://issues.apache.org/jira/browse/HUDI-1822

> Please keep the status updated in `rfc/README.md`.

## Abstract

Query engines typically scan large amounts of irrelevant data for query planning and execution. Some workarounds are available to reduce the amount of irrelevant data scanned. These include:
- Partition pruning
- File pruning
  - Some data file formats contain metadata including range information for certain columns (for parquet, this metadata is stored in the footer).
  - As part of query planning, all range information from data files is read.
  - Irrelevant data files are then pruned based on predicates and the available range information.

Partition pruning typically puts the burden on users to select the partitions where the data may exist. The file pruning approach is expensive and does not scale if there are a large number of partitions and data files to be scanned. So we propose a new solution: store additional information as part of the Hudi metadata table to implement a data skipping index. The goals of the data skipping index are to provide:

- Global index: Users query for the information they need without needing to specify partitions. The index can effectively find data files in the table.
- Improved query plans: Efficiently find data files that have information for the specified query predicates.
- Support for multiple types of index: The initial implementation may provide a range index, but the goal is to provide a flexible framework to implement other types of index (e.g. bloom).

## Background
RFC-15 added metadata table support to Hudi for optimized file listing. RFC-37 is adding a metadata index and column stats as another partition to the metadata table. This RFC will piggyback on the column stats partition that RFC-37 will be adding to the metadata table.

Note: The effectiveness of the index will be proportional to how the data is laid out. If every file contains data for a commonly specified query predicate, the index may not be very effective.

## Implementation
At a high level there are 3 components to implement index support:
- Storage format
- Metadata generation
- Query engine integration

### Column_Stats Index/Partition
We want to support multiple types of index (range, bloom, etc.), so it is important to generate different types of record for different columns. The focus of this RFC is the column range or column stats index, i.e. min/max values, null counts, etc. Users can configure the commonly queried columns, and the column stats partition in the metadata table will store all stats pertaining to the configured columns for every valid data file where the column is present.

Similar to how we generate records for the files partition in the metadata table, we will generate a HoodieMetadataRecord for the column stats partition on any commit that gets applied to the metadata table. The basic building blocks of the metadata table used for file listing will be used for this column stats partition as well (how updates are applied to the metadata table, how invalid data is ignored, etc.).

The column_stats partition stores statistics for all indexed columns in the Hudi data table. The index maintained in this partition helps with predicate pushdown/data skipping - file filtering based on column predicates.

For the purpose of column predicate filtering, this partition can store statistics for any column as per configs.

So, the high level requirement for this column_stats partition (pertaining to this RFC) is:
 - Given a list of columns and predicates (and optionally partitions), return a list of matching file names

### Storage format
To cater to the above requirement, we plan to encode the column name, partition path and file name into the keys in HFile. Since HFile supports efficient range/prefix search, our lookup should be very fast.
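As a concrete illustration of this key layout and the prefix search it enables, here is a minimal, self-contained sketch. This is not the actual Hudi implementation: the MD5-based hashing, the ID widths, the sample names, and the `TreeMap` standing in for an HFile file group are all illustrative assumptions.

```java
// Illustrative sketch only: fixed-width, base64-encoded hash IDs are concatenated into
// sortable keys, so a sorted store (TreeMap here, standing in for HFile) can answer
// lookups like [colId] or [colId][partId] with a cheap contiguous range scan.
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.util.Arrays;
import java.util.Base64;
import java.util.SortedMap;
import java.util.TreeMap;

public class ColStatsKeySketch {

  // Hash a name, keep `numBytes` of the digest, and base64-encode it into a fixed-width ID.
  static String id(String name, int numBytes) throws Exception {
    byte[] digest = MessageDigest.getInstance("MD5").digest(name.getBytes(StandardCharsets.UTF_8));
    return Base64.getEncoder().encodeToString(Arrays.copyOf(digest, numBytes));
  }

  public static void main(String[] args) throws Exception {
    String colId = id("rider", 8);                // ColumnID
    String partId = id("2022/01/26", 8);          // PartitionID
    String fileId = id("f1-0_1-2_3.parquet", 16); // FileID

    SortedMap<String, String> shard = new TreeMap<>(); // stand-in for one HFile file group
    shard.put(colId + partId + fileId, "min=a,max=z,nulls=0"); // per-file column stat
    shard.put(colId + "agg" + partId, "min=a,max=z,nulls=0");  // per-partition aggregate

    // Prefix search: all entries under [colId][partId] form one contiguous key range.
    String prefix = colId + partId;
    shard.subMap(prefix, prefix + Character.MAX_VALUE)
        .forEach((k, v) -> System.out.println(k + " -> " + v));
  }
}
```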
![Column Stats Partition](col_stats.png)

We plan to generate unique and random hash IDs for all 3 components:
- ColumnID:
  - base64(hash32(column name))
  - on-disk size = 12 bytes per col_stat per file
- PartitionID:
  - base64(hash32(partition name))
  - on-disk size = 12 bytes per partition
- FileID:
  - base64(hash128(file name))
  - on-disk size = 24 bytes per file

#### Design Choices for ID generation
1. Incremental IDs: Sequentially increasing IDs can be generated in the context of the ongoing commit/write. The ID can always start at 1, and to make the full ID unique enough, sequential IDs can be appended with the ongoing commit time.
   a. Pros:
      The ID is simple to generate and doesn't depend on key lookups to resume ID generation across writers.
      The overall ID can be shorter than hash based IDs and can still be unique.
      Differential/delta encoding works well with sequential numbers and can achieve a high compression ratio (though we didn't see this in the tests).
   b. Cons:
      The same column can be given several IDs across several commits spilled over several files. Complex merging logic is needed to coalesce them all when looking up any interested columns.
      Doesn't work well with schema evolution. Even without schema evolution, changing IDs for the same column is by itself a small schema evolution problem.

2. Hash IDs: Hashing utilities can be used to generate unique and random IDs of any length for the given column/partition/file name.
   a. Pros:
      Deterministic name-to-ID generation.
      Reverse lookup of ID to name is possible with a relatively much smaller meta index read.
      The ID length can be controlled for the scaling needs.
      Sharding and locality can be controlled by prefixing with more bits (doable with incremental IDs also).
   b. Cons:
      Big scale deployments demand a huge ID space for files, thereby needing 128-bit hashes.
      These are usually 32-digit hex chars, taking up at least 32 bytes/ID on disk. However, base64 encoding can help shave off a few bytes and get them down to 24 bytes.
      They take up larger space in memory and on disk compared to sequential IDs. Theoretically, the compression ratio should be lower than with sequential IDs.

Key format in the column_stats partition:
- [colId][PartitionId][FileId]
- [colId]+"agg"+[PartitionId]

The first type will be used to store one entry per column per file, and the second type will be used to store one aggregated entry per column per partition. These are fixed-size keys, so lookups don't have to search for ID delimiters as in the case of incremental IDs.

These key encodings fit in well to serve our requirements. Since we are using HFile as the format, all keys are going to be sorted, and hence range reads will be very effective for our use case; we have chosen the key format consciously with this in mind.

Given a list of columns and optionally partitions, return a list of matching file names:

1. We can do a prefix search of [ColumnID] or [ColumnID][PartitionID]
   - If both the columnId and partitionIds are supplied, we will do a range read of [colId][partitionId].
   - If the list of partitions is not available as part of the query, we will first look up [colId]+"agg" to do a prefix search for partition level stats, filter for those partitions which match the predicates, and then follow (1) as in the previous line.
2. Fetch only the interested entries for the [colId][partitionId] list.
3. Look up the stats and filter for matching FileIDs.
4. Reverse lookup in the Files partition to get the FileID to FileName mapping.

Note:
As you can see here, the reverse lookup of the FileID to file name mapping has to go into the "Files" partition to satisfy our requirement. So, the "Files" partition will be extended with additional entries of fileId to fileName mappings on the write path.

#### Sharding:
Any partition in the metadata table needs to be instantiated with N file groups/shards upfront. The "Files" partition is small, and hence we went with just one file group. But for a record level index, we can't go with a single file group and have to shard the data. We will employ some kind of hashing mechanism for the key to file group mapping, as sketched below. On the write path, entries will be sharded and written to different file groups. On the read path, the key to be looked up will be hashed to find the right file group to look in. For a wild card search, all file groups will be looked up.
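A purely hypothetical sketch of such a key-to-file-group mapping (the concrete hashing scheme is still to be decided; `shardFor` and its modulo scheme are illustrative assumptions):

```java
// Hypothetical sketch: route an encoded metadata key to one of N file groups/shards.
// Writers and readers must share this function so a key is always written to, and read
// from, the same shard; a wild card search has to fan out to all shards.
static int shardFor(String encodedKey, int numFileGroups) {
  return (encodedKey.hashCode() & Integer.MAX_VALUE) % numFileGroups;
}
```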
// To be revisited.
We plan to instantiate the number of file groups in the column stats partition based on the number of columns being indexed. We can't estimate upfront the data scale to which the table might eventually grow, and hence have to go with some estimates. So a rough idea is to instantiate one file group per 10 columns being indexed, or get some rough input from the user on whether the table will be small/medium/large scale and determine it based on that.

Similar to how we generate records for the files partition in the metadata table, we will generate a HoodieMetadataRecord for the column stats partition on any commit that gets applied to the metadata table.

### Metadata generation
The existing metadata payload schema will be extended and shared for this new "column_stats" partition also. The type field will be used to detect the column stats payload record. Here is the schema for the column stats payload record.

```
 "namespace": "org.apache.hudi.avro.model",
 "type": "record",
 "name": "HoodieMetadataRecord",
 "doc": "A record saved within the Metadata Table",
 "fields": [
   {
     "name": "key",
     "type": "string"
   },
   {
     "name": "type",
     "doc": "Type of the metadata record",
     "type": "int"
   },
   { "name": "filesystemMetadata",
     .
     .
     .
   },
   {
     "name": "ColumnStatsMetadata",
     "doc": "Contains information about column statistics for all data files in the table",
     "type": [
       "null",
       {
         "type": "record",
         "name": "HoodieColumnStats",
         "fields": [
           {
             "name": "rangeLow",
             "type": [
               "null",
               "bytes"
             ],
             "doc": "Low end of the range. For now, this is a String. Based on main data table schema, we can convert it to appropriate type"
           },
           {
             "name": "rangeHigh",
             "type": [
               "null",
               "bytes"
             ],
             "doc": "High end of the range. For now, this is a String. Based on main data table schema, we can convert it to appropriate type"
           },
           {
             "name":"total_values",
             "type":["long", "null"],
             "doc" : "Stores total values for this column in the respective data file"
           },
           {
             "name":"total_nulls",
             "type":["long", "null"],
             "doc" : "Stores total null values for this column in the respective data file"
           },
           {
             "name":"total_nans",
             "type":["long", "null"],
             "doc" : "Stores total NaN values for this column in the respective data file"
           },
           {
             "name":"total_size_on_disk",
             "type":["long", "null"],
             "doc" : "Stores total size occupied by this column on disk corresponding to the respective data file"
           },
           {
             "name": "isDeleted",
             "type": "boolean",
             "doc": "True if this file has been deleted"
           }
         ]
       }
     ]
   }
```

Column stats records hold all the stats for the file. The key for the column stat record would be an encoded string as discussed earlier.

```
key = base64_encode(hash64(column name) + hash64(partition name) + hash128(file path))
key = base64_encode(hash64(column name) + "agg" + hash64(partition name))
```

While hash based IDs have quite a few desirable properties in the context of Hudi index lookups, there is an impact on column level schema changes, though. Refer to the [Schema Evolution](#Schema-Evolution) section for more details.

#### Writer flow
Let's walk through the writer flow to update the column_stats partition in the metadata table.

1. Files partition - prepare records for adding // just calling out what's required in the context of the column_stats partition. The general files partition will be updated as usual to store file listing information.
   - FileID => FileName mapping entries
   - PartitionID => PartitionName entry, if it does not already exist
   - Since these IDs are hash based IDs, no lookup of prior usages is required here. If not, we would need to know the last assigned ID and then go about assigning new incremental/sequential IDs, which slows down performance significantly
2. Column_stats partition - prepare records for adding
   - [ColumnID][PartitionID][FileID] => ColumnStat
   - [ColumnId]"agg"[PartitionId] => ColumnStat
   - This involves reading the base file footers to fetch min/max and other stats to populate values for the record.
3. Commit all these records to the metadata table.

We need to ensure we have all the sufficient info in the WriteStatus/commit metadata that gets passed to the metadata writer for every commit. Reading parquet footers and meta is unavoidable, but other than that, we should try to embed all other info in the WriteStatus.

### Index integrations with query engines

#### Spark
We already added support for z-ordering with 0.10.0, so we will re-use the data skipping code paths from there.

Here is the high level flow of z-ordering:
##### Write path
1. Sort the data (Z-order/Hilbert/Linear)
   - Being triggered by Clustering (right now)
   - RDDSpatialCurveOptimizationSortPartitioner
2. Build the "Col Stats" Index (.hoodie/.colstatsindex)
   - Upon Clustering completion we invoke ColumnStatsIndexHelper.updateColumnStatsIndexFor

##### Read path
1. (Spark SQL) Asks for a list of files to fetch data from
   - HoodieFileIndex.listFiles
2. HoodieFileIndex will read the Col Stats Index and apply the data predicates to fetch the list of candidate files from it
3. Returns it back to Spark

Given this, let's see how we can integrate the new column_stats partition.

##### Z-order Write path
1. Sort the data (Z-order/Hilbert/Linear)
   - Being triggered by Clustering (right now)
   - RDDSpatialCurveOptimizationSortPartitioner
2. Do not do anything.
   - Upon Clustering completion, the replace commit will get applied to the metadata table by default if metadata is enabled.

##### Read path
1. (Spark SQL) Asks for a list of files to fetch data from
   - HoodieFileIndex.listFiles
2. HoodieFileIndex will read the Col Stats partition in the metadata table and apply the data predicates to fetch the list of candidate files from it
3. Returns it back to Spark

One caveat: we can't get rid of the z-order index completely right away, though. If the metadata table is not built out yet, or has entered an inconsistent state and is not usable, we have to go the existing way of building an index at the end of z-order clustering.

### Predicate filtering

#### How to apply query predicates in Hudi?
Query predicates are normally constructed in a tree-like structure, so this will follow the same pattern. The proposal is to create a mapping utility from "Engine" query predicates to a HudiExpression. This way the filtering logic is engine agnostic.

For AND and OR operators we can translate to a tree node with left and right expressions. An example of what the structure would look like is shown below.

```java
public class HudiExpressionParentNode implements HudiExpression {
  HudiExpression left;
  HudiExpression right;

  @Override
  public boolean evaluate() {
    return left.evaluate() && right.evaluate();
  }
}
```

For LEAF nodes we can create an expression which contains the operator and the value we are comparing, to determine whether the file group may have data relevant to this query. The common search expressions for the leaf nodes:

1. Equal to - if the value in the search expression is greater than or equal to the lower bound and is less than or equal to the upper bound in the file's column statistics, then true, else false
2. Less than - if the value in the search expression is greater than the lower bound in the file's column statistics, then true, else false
3. Less than or equal to - if the value in the search expression is greater than or equal to the lower bound in the file's column statistics, then true, else false
4. Greater than - if the value in the search expression is lower than the upper bound in the file's column statistics, then true, else false
5. Greater than or equal to - if the value in the search expression is lower than or equal to the upper bound in the file's column statistics, then true, else false

True tells us that there is a possibility that the file contains data which matches the search expression and it should be included in the result set. False tells us that there is no possibility this file contains any data which matches the search expression and it should be excluded from the results.

```java
public abstract class HudiExpressionLeafNode<T> implements HudiExpression {

  Operator op;   // (EQ, LT, LTEQ, GT, GTEQ)
  T literal;     // (INT, DOUBLE, FLOAT value)
  String column;

  @Override
  public abstract boolean evaluate();
}
```

This way we can call evaluate on the root HudiExpression tree and it will determine whether the entire expression is satisfied for the file group.

#### Hive
In order for us to implement predicate push down in Hive we need to have access to the query predicate. The query predicate is not passed to the InputFormat by default. The HiveStoragePredicateHandler interface needs to be implemented in order to provide the query predicate to the InputFormat, and for this we need to create a custom HiveStorageHandler. Therefore we will be creating a new storage handler, HudiStorageHandler.

```java
public interface HiveStorageHandler extends Configurable {
  public Class<? extends InputFormat> getInputFormatClass();
  public Class<? extends OutputFormat> getOutputFormatClass();
  public Class<? extends AbstractSerDe> getSerDeClass();
  public HiveMetaHook getMetaHook();
  public void configureTableJobProperties(
      TableDesc tableDesc,
      Map<String, String> jobProperties);
}
```

Everything will remain the same, with the input format, output format, and serde classes being used in existing Hudi tables registered in Hive (HoodieParquetInputFormat still being used). HudiStorageHandler would implement HiveStorageHandler and HiveStoragePredicateHandler.

Hive adds the query predicate returned by the Storage Handler to the job configuration. This job configuration is then supplied to the Input Format. It can be fetched and deserialized using the following:

```java
  String hiveFilter = jobConf.get(TableScanDesc.FILTER_EXPR_CONF_STR);
  if (hiveFilter != null) {
    ExprNodeGenericFuncDesc exprNodeDesc = SerializationUtilities
        .deserializeObject(hiveFilter, ExprNodeGenericFuncDesc.class);
    SearchArgument sarg = ConvertAstToSearchArg.create(jobConf, exprNodeDesc);
  }
```

The SearchArgument contains an ExpressionTree and a list of PredicateLeaf. The ExpressionTree is a tree structure used to define the query predicate. If the operator is defined as OR, AND, or NOT, this indicates there are children expressions, normally LEAFs.

```java
public class ExpressionTree {
  public enum Operator {OR, AND, NOT, LEAF, CONSTANT}
  private final Operator operator;
  private final List<ExpressionTree> children;
  private int leaf;
}
```

If the operator in the ExpressionTree is defined as LEAF, it corresponds to a PredicateLeaf defined in the SearchArgument.
PredicateLeaf will contain information about the query predicate, such as the operator, column name, and literal which is being compared:

```java
  private final org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf.Operator operator;
  private final Type type;
  private String columnName;
  private final Object literal;
  private final List<Object> literalList;
```

We can use this information and the SearchArgument to generate our HudiExpression. Then in HoodieParquetInputFormat.listStatus(), after fetching files from the FileSystemView for the remaining file groups, we can apply the HudiExpression using the column metadata.

#### Presto
To be filled.

## Rollout/Adoption Plan

- What impact (if any) will there be on existing users?
- If we are changing behavior how will we phase out the older behavior?
- If we need special migration tools, describe them here.
- When will we remove the existing behavior?

## Test Plan

Describe in a few sentences how the RFC will be tested. How will we know that the implementation works as expected? How will we know nothing broke? \ No newline at end of file
diff --git a/rfc/rfc-37/metadata_index_1.png b/rfc/rfc-37/metadata_index_1.png new file mode 100644 index 0000000000000..40b834f40f9b8 Binary files /dev/null and b/rfc/rfc-37/metadata_index_1.png differ
diff --git a/rfc/rfc-37/metadata_index_bloom_partition.png b/rfc/rfc-37/metadata_index_bloom_partition.png new file mode 100644 index 0000000000000..8ada4b7f2c18f Binary files /dev/null and b/rfc/rfc-37/metadata_index_bloom_partition.png differ
diff --git a/rfc/rfc-37/metadata_index_col_stats.png b/rfc/rfc-37/metadata_index_col_stats.png new file mode 100644 index 0000000000000..02a77fe0dd6d2 Binary files /dev/null and b/rfc/rfc-37/metadata_index_col_stats.png differ
diff --git a/rfc/rfc-37/rfc-37.md b/rfc/rfc-37/rfc-37.md new file mode 100644 index 0000000000000..28d27b399482e --- /dev/null +++ b/rfc/rfc-37/rfc-37.md @@ -0,0 +1,329 @@

# RFC-37: Metadata based Bloom Index

## Proposers
- @nsivabalan
- @manojpec

## Approvers
- @vinothchandar
- @satishkotha

## Status
JIRA: https://issues.apache.org/jira/browse/HUDI-2703

## Abstract
Hudi maintains several indices to locate/map incoming records to file groups during writes. The most commonly used record index is the HoodieBloomIndex. Larger tables and the global index have performance issues, as the bloom filters from a large number of data files need to be read and looked up. Reading from several files over cloud object storage like S3 also faces request throttling issues. We are proposing to build a new metadata index (metadata table based bloom index) to boost the performance of the existing bloom index.

## Background

HoodieBloomIndex is used to find the location of incoming records during every write. The bloom index assists Hudi in deterministically routing records to a given file group and in distinguishing inserts vs updates. This aggregate bloom index is built from several bloom filters stored in the base file footers. Prior to the bloom filter lookup, file pruning for the incoming records is also done based on the record key min/max stats stored in the base file footers. In this RFC, we plan to build a new index for the bloom filters under the metadata table to assist in bloom index based record location tagging. This overlaps with [RFC-27 Data skipping index ](https://cwiki.apache.org/confluence/display/HUDI/RFC-27+Data+skipping+index+to+improve+query+performance) in the read path for improving the query performance.
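Before diving into the design, here is a conceptual sketch of the pruning contract a per-file bloom filter gives the index. Guava's `BloomFilter` is used purely as a stand-in for Hudi's own bloom filter implementation; the class name and keys are illustrative:

```java
// Conceptual sketch only: what a per-file bloom filter buys the index.
import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;
import java.nio.charset.StandardCharsets;

public class BloomPruneSketch {
  public static void main(String[] args) {
    // One bloom filter per base file, sized for the expected record count and false positive rate.
    BloomFilter<CharSequence> fileBloom =
        BloomFilter.create(Funnels.stringFunnel(StandardCharsets.UTF_8), 10_000, 0.000001);
    fileBloom.put("key-001"); // record keys written to this base file

    // false => the key is definitely NOT in the file, so the file can be skipped (a new insert).
    // true  => the key MAY be in the file: keep the file as a candidate and confirm by
    //          actually reading it (Level 3, record validation, in the lookup flow below).
    boolean candidate = fileBloom.mightContain("key-042");
    System.out.println("candidate file? " + candidate);
  }
}
```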
## Design
HoodieBloomIndex involves the following steps to find the right location of incoming records:

1. Find all the interested partitions and list all their data files.
2. File Pruning: Load the record key min/max details from all the interested data file footers. Filter files and generate the files-to-keys mapping for the incoming records based on the key ranges, using a range interval tree built from the previously loaded min/max details.
3. Bloom Filter lookup: Filter files and prune the files-to-keys mapping for the incoming keys based on the bloom filter key lookup.
4. Final lookup in the actual data files to find the right location of every incoming record.

As we can see from steps 1 and 2, we need the min and max values for "_hoodie_record_key" and the bloom filters from all interested data files to perform the location tagging. In this design, we will add these key stats and the bloom filters to the metadata table, and can thereby quickly load the interested details and do faster lookups.

The metadata table already has one partition `files` to help in partition file listing. For the metadata table based indices, we are proposing to add the following two new partitions:
1. `bloom_filter` - for the file level bloom filters
2. `column_stats` - for the key range stats

Why the metadata table:
The metadata table uses HBase HFile - the tree map file format - to store and retrieve data. HFile is an indexed file format and supports map-like fast lookups by keys. Since we will be storing stats/bloom filters for every file and the index will do lookups based on files, we should be able to benefit from the faster lookups in HFile.

High Level Metadata Index Design

The following sections will talk about the different partitions and key formats, and then dive into the data and control flows.

### MetaIndex/BloomFilter:

A new partition `bloom_filter` will be added under the metadata table. Bloom filters from all the base files in the data table will be added here. The metadata table is already in the HFile format. The existing metadata payload schema will be extended and shared for this partition also. The type field will be used to detect the bloom filter payload record. Here is the schema for the bloom filter payload record.
```
  {
    "doc": "Metadata Index of bloom filters for all data files in the user table",
    "name": "BloomFilterMetadata",
    "type": [
      "null",
      {
        "doc": "Data file bloom filter details",
        "name": "HoodieMetadataBloomFilter",
        "type": "record",
        "fields": [
          {
            "doc": "Bloom filter type code",
            "name": "type",
            "type": "string"
          },
          {
            "doc": "Instant timestamp when this metadata was created/updated",
            "name": "timestamp",
            "type": "string"
          },
          {
            "doc": "Bloom filter binary byte array",
            "name": "bloomFilter",
            "type": "bytes"
          },
          {
            "doc": "Bloom filter entry valid/deleted flag",
            "name": "isDeleted",
            "type": "boolean"
          }
        ]
      }
    ]
  }
```

The key for the bloom filter record would be an encoded string representing the partition and base file combo. The partition and the file names are converted to deterministic hash based IDs, and then they are base64 encoded. Hash based IDs are easy to generate for the incoming new insert records and for the lookup of updated records. They don't need any dictionary to be added for reverse lookups. The hash bits are chosen based on the cardinality and the collision probability desired to support maximum-scale deployments.
Base64 encoding the hash IDs further reduces the on-disk storage space for these keys.

```
key = base64_encode(concat(hash64(partition name), hash128(file name)))
```

Bloom filter partition

### MetaIndex/ColumnStats:

Another new partition `column_stats` will also be added under the metadata table to make the record key lookup code path much more performant. This metadata index also helps in data skipping (please look at RFC-27 for more details). In the context of faster record key lookups for the update use cases, we propose the `column_stats` index to be used for file pruning when generating the file-to-candidate-keys mapping for the update records. The existing metadata payload schema will be extended and shared for this partition also. The type field will be used to detect the column stats payload record. Here is the schema for the column stats payload record.

```
  {
    "doc": "Metadata Index of column statistics for all data files in the user table",
    "name": "ColumnStatsMetadata",
    "type": [
      "null",
      {
        "doc": "Data file column statistics",
        "name": "HoodieColumnStats",
        "type": "record",
        "fields": [
          {
            "doc": "File name for which this column statistics applies",
            "name": "fileName",
            "type": [
              "null",
              "string"
            ]
          },
          {
            "doc": "Minimum value in the range. Based on user data table schema, we can convert this to appropriate type",
            "name": "minValue",
            "type": [
              "null",
              "string"
            ]
          },
          {
            "doc": "Maximum value in the range. Based on user data table schema, we can convert it to appropriate type",
            "name": "maxValue",
            "type": [
              "null",
              "string"
            ]
          },
          {
            "doc": "Total count of values",
            "name": "valueCount",
            "type": [
              "null",
              "long"
            ]
          },
          {
            "doc": "Total count of null values",
            "name": "nullCount",
            "type": [
              "null",
              "long"
            ]
          },
          {
            "doc": "Total storage size on disk",
            "name": "totalSize",
            "type": [
              "null",
              "long"
            ]
          },
          {
            "doc": "Total uncompressed storage size on disk",
            "name": "totalUncompressedSize",
            "type": [
              "null",
              "long"
            ]
          },
          {
            "doc": "Column range entry valid/deleted flag",
            "name": "isDeleted",
            "type": "boolean"
          }
        ]
      }
    ]
  }
```

Column stats records hold the key ranges (min and max) for the file. The key for the column stat record would be an encoded string representing the tuple of column name, partition name and the base file. The string names of these fields are converted to deterministic hash based IDs, and then they are base64 encoded, just like the bloom filter key.

```
key = base64_encode(concat(hash64(column name), hash64(partition name), hash128(file name)))
```

While hash based IDs have quite a few desirable properties in the context of Hudi index lookups, there is an impact on column level schema changes, though. Refer to the [Schema Evolution](#Schema-Evolution) section for more details.

The picture below gives a pictorial representation of the column stats partition in the metadata table.
Column Stats Partition

### Metadata Index lookup:

For the incoming upsert records, given their keys, tag their current location. The new algorithm for the index lookup would be:

1. Generate the list of partitions and the list of keys under each partition to be looked up
2. For all the involved partitions, load their file lists
3. Level 1: Range pruning using the `column_stats` index:
   1. For each record key, generate the column stats index lookup key based on the tuple (__hoodie_record_key, partition name, file path)
   2. Meta index lookup with the above key and, if available, get the value payload with the column stats details
   3. Prune the partition and its candidate files based on the range comparisons
4. Level 2: Record pruning using the `bloom_filter` index:
   1. From the shortlisted file candidates per partition, generate the bloom filter index lookup key based on the tuple (partition name, file path)
   2. Meta index lookup with the above key to load the base file bloom filter
   3. Bloom filter lookup for the record key to generate the candidate keys that are probably available in the base file
5. Level 3: Record validation
   1. Given the list of files and their candidate keys from the above pruning, do the actual file lookup to confirm the keys
   2. Return the location (file id) of the final matching keys

### Schema Evolution:

HashID based keys are deterministically generated from the tuple input. That is, for the tuple consisting of column name, partition name and file name, the key generated would always be the same. So, a table where the schema gets changed over time would have an impact on the keys already generated. The most common schema evolution use cases, like changing a column type or adding a new column, are not affected though. Other relatively uncommon use cases, like renaming a column, or dropping a column and adding a new column with the dropped name, would have indices referring to them more than needed. This would lead to the index lookup matching stale/new records across evolved schemas.

To avoid looking up stale/new index records, here are the design options we have:
1. (Preferred) Query rewrite / Result recordset pruning
   1. The schema evolution layer should introduce a query rewrite stage to detect evolved schemas for the input query and optionally include additional predicates in the query
   2. The resultant recordset can also be pruned based on the commit time and the schema change time
2. Making the input tuple set schema aware
   1. Along with the column name, partition name and file path, a version/tag can also be added to make the generated key very schema specific. But this choice has a performance impact, as the lookup now has to be more of a prefix scan instead of pointed lookups. That is, the index lookup has to return records for all the versions/tags, and pruning has to be done on top of this.

## Implementation

1. No change to the HoodieIndex public interface.
```
  /**
   * Looks up the index and tags each incoming record with a location of a file that contains
   * the row (if it is actually present).
   */
  @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING)
  public abstract <R> HoodieData<HoodieRecord<R>> tagLocation(
      HoodieData<HoodieRecord<R>> records, HoodieEngineContext context,
      HoodieTable hoodieTable) throws HoodieIndexException;
```
2. HoodieBloomIndex::explodeRecordsWithFileComparisons() will be extended to check for a new config `hoodie.metadata.file_pruning.enable` and, if enabled, the metadata table based column stats will be used for file pruning based on key ranges.
3.

### Writer flow:
Let's walk through the writer flow to update these partitions.

Whenever a new commit is getting applied to the metadata table, we do the following:
1. Files partition - prepare records for adding
2. Column_stats partition - prepare records for adding
   [ColumnIndexID][PartitionIndexID][FileIndexID] => ColumnStats
   This involves reading the base file footers to fetch the min/max values for each column
3. Bloom_filter partition - prepare records for adding
   [PartitionIndexID][FileIndexID] => BloomFilter
   This involves reading the base file footers.
   We can amortize the cost across (2) and (3) and just read the footers once and prepare/populate records for both partitions.
4. Commit all these records to the metadata table.

We need to ensure we have all the sufficient info in the WriteStatus that gets sent to the metadata writer for every commit.

### Reader flow:
When a new batch of writes is ingested into Hudi, we need to tag the records with their original file group location. This index will leverage both of the partitions to deduce the record key => file name mappings. Refer to the Metadata Index lookup section for more details.

## Rollout/Adoption Plan
* Release 0.10.0 is a flag-day release, meaning the old metadata table will be wiped out and a new one will be built.
* The Metadata Index feature is planned for a 0.10.x version. Any preparatory changes/features (like `Metadata new indexing for existing tables`, RFC proposal and doc pending) that are needed to have this feature in the later minor release need to be rolled out as part of 0.10.0
* TODO: More details on the rollout plan

## Test Plan
* Functionality
  * Tag location for existing keys
  * Tag location for non-existing keys
* Performance
  * Prove Metadata based indices are helping upsert use cases
* Upgrade
* TODO: More details on the test plan
diff --git a/rfc/rfc-38/1.png b/rfc/rfc-38/1.png new file mode 100644 index 0000000000000..44238888c9b81 Binary files /dev/null and b/rfc/rfc-38/1.png differ
diff --git a/rfc/rfc-38/rfc-38.md b/rfc/rfc-38/rfc-38.md new file mode 100644 index 0000000000000..d007bd0b663da --- /dev/null +++ b/rfc/rfc-38/rfc-38.md @@ -0,0 +1,283 @@

# RFC-38: Spark Datasource V2 Integration

## Proposers

- @leesf

## Approvers
- @vinothchandar
- @xishiyan
- @YannByron

## Status

JIRA: https://issues.apache.org/jira/browse/HUDI-1297

## Abstract

Today, Hudi still uses the DataSource V1 API and relies heavily on the RDD API to index, repartition and so on; given the flexibility of the RDD API, this works fine under V1. Using the DataSource V1 API, Hudi provides complete read/write and update capabilities along with automatic small file handling, and all of this works well. However, the DataSource V2 API has continued to develop and has now stabilized. Taking into account that the V1 API is old and the Spark community no longer invests significant resources in maintaining it, we should consider migrating to the DataSource V2 API, using more of the pushdown filters provided by the V2 API and integrating with [RFC-27](https://cwiki.apache.org/confluence/display/HUDI/RFC-27+Data+skipping+index+to+improve+query+performance) to provide more powerful query capabilities. We could also continue to benefit as the V2 API evolves and is optimized further.


## Background

The current Hudi read and write paths use the DataSource V1 API, and the implementation class is `DefaultSource`.

```scala
/**
* Hoodie Spark Datasource, for reading and writing hoodie tables
*
*/
class DefaultSource extends RelationProvider
  with SchemaRelationProvider
  with CreatableRelationProvider
  with DataSourceRegister
  with StreamSinkProvider
  with StreamSourceProvider
  with Serializable {
...
}
```

As for writing (batch write), the following method will be called:
```scala
override def createRelation(sqlContext: SQLContext,
                            mode: SaveMode,
                            optParams: Map[String, String],
                            df: DataFrame): BaseRelation = {
  val parameters = HoodieWriterUtils.parametersWithWriteDefaults(optParams)
  val translatedOptions = DataSourceWriteOptions.translateSqlOptions(parameters)
  val dfWithoutMetaCols = df.drop(HoodieRecord.HOODIE_META_COLUMNS.asScala:_*)

  if (translatedOptions(OPERATION.key).equals(BOOTSTRAP_OPERATION_OPT_VAL)) {
    HoodieSparkSqlWriter.bootstrap(sqlContext, mode, translatedOptions, dfWithoutMetaCols)
  } else {
    HoodieSparkSqlWriter.write(sqlContext, mode, translatedOptions, dfWithoutMetaCols)
  }
  new HoodieEmptyRelation(sqlContext, dfWithoutMetaCols.schema)
}
```

Regarding querying, the following method will return a `BaseRelation` (if no schema is provided):

```scala
override def createRelation(sqlContext: SQLContext,
                            parameters: Map[String, String]): BaseRelation = {
  createRelation(sqlContext, parameters, null)
}
```

For streaming writing and reading, `DefaultSource#createSink` and `DefaultSource#createSource` are called respectively. In the 0.9.0 release, the bulk_insert row mode was introduced to speed up bulk_insert; it implements the `SupportsWrite` V2 API and uses `HoodieDataSourceInternalTable` for writing. Right now only the bulk_insert operation is supported.

## Implementation

Spark provides a complete V2 API, with interfaces such as `CatalogPlugin`, `SupportsWrite`, `SupportsRead`, and various pushdown filters, such as `SupportsPushDownFilters`, `SupportsPushDownAggregates`, `SupportsPushDownRequiredColumns`.

We would define the key abstraction, called `HoodieInternalV2Table`, which implements the `Table`, `SupportsWrite`, `SupportsRead` interfaces to provide writing and reading capabilities.

### Writing Path

Hudi relies heavily on some RDD APIs on the write path, such as the indexing that determines whether a record is an update or an insert; migrating this to the V2 write path would be a relatively large, or even impossible, piece of refactoring under the DataSource V2 API. So we can fall back to the V1 write path, since Spark 3.2.0 provides the `V1Write` interface to bridge the V1 and V2 APIs.

The writing path code snippet is below:

```scala
class HoodieInternalV2Table extends Table with SupportsWrite with V2TableWithV1Fallback {

  override def name(): String = {
    //
  }

  override def schema(): StructType = {
    // get hudi table schema
  }

  override def partitioning(): Array[Transform] = {
    // get partitioning of hudi table.
  }

  override def capabilities(): Set[TableCapability] = {
    // Set(BATCH_WRITE, BATCH_READ,TRUNCATE,...)
  }

  override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = {
    // HoodieV1WriteBuilder
  }
}
```

The definition of `HoodieV1WriteBuilder` is shown below.
```scala
private class HoodieV1WriteBuilder(writeOptions: CaseInsensitiveStringMap,
                                   hoodieCatalogTable: HoodieCatalogTable,
                                   spark: SparkSession)
  extends SupportsTruncate with SupportsOverwrite with ProvidesHoodieConfig {

  override def truncate(): HoodieV1WriteBuilder = {
    this
  }

  override def overwrite(filters: Array[Filter]): WriteBuilder = {
    this
  }

  override def build(): V1Write = new V1Write {
    override def toInsertableRelation: InsertableRelation = {
      // InsertableRelation
    }
  }
}
```

### Querying path

For V2 querying, Spark provides various pushdown filters, such as `SupportsPushDownFilters`, `SupportsPushDownAggregates`, `SupportsPushDownRequiredColumns`, `SupportsRuntimeFiltering` and so on, which are clearer and more flexible than the V1 interface. Also, the V2 interface provides the capability to read columnar format files such as parquet and orc. Moreover, the V2 interface gives users control over how the input is split and over the number of partitions, which makes it possible to produce more accurate splits and accelerate query speed on the Hudi side. However, for querying, in the first stage we also fall back to the V1 read path, which means we need to convert `DataSourceV2Relation` to `DefaultSource` in the analysis stage to keep the changes well controlled. The code snippet is shown below; `HoodieSpark3Analysis` should be injected if the Spark version is 3.2.0 or later.

```scala

case class HoodieSpark3Analysis(sparkSession: SparkSession) extends Rule[LogicalPlan]
  with SparkAdapterSupport with ProvidesHoodieConfig {

  override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsDown {
    case dsv2@DataSourceV2Relation(d: HoodieInternalV2Table, _, _, _, _) =>
      val output = dsv2.output
      val catalogTable = if (d.catalogTable.isDefined) {
        Some(d.v1Table)
      } else {
        None
      }
      val relation = new DefaultSource().createRelation(new SQLContext(sparkSession),
        buildHoodieConfig(d.hoodieCatalogTable))
      LogicalRelation(relation, output, catalogTable, isStreaming = false)
  }
}

```
In the second stage, we would make use of the V2 reading interface and define `HoodieBatchScanBuilder` to provide querying capability. The workflow of the querying process is shown in the figure below. The `PartitionReaderFactory` is located in the Driver and the `PartitionReader` in the Executors.

![](./1.png)

The querying path code sample is below:

```scala
class HoodieBatchScanBuilder extends ScanBuilder with SupportsPushDownFilters with SupportsPushDownRequiredColumns {
  override def build(): Scan = {
    // HoodieScan
  }

  override def pushFilters(filters: Array[Filter]): Array[Filter] = {
    // record the filters
  }

  override def pushedFilters(): Array[Filter] = {
    // pushed filters
  }

  override def pruneColumns(requiredSchema: StructType): Unit = {
    // record the pruned columns
  }
}
```

### Table Meta Management

We implement the `CatalogPlugin` interface to manage the metadata of Hudi tables and define the core abstraction called `HoodieCatalog`; the code sample is below.
```scala
class HoodieCatalog extends DelegatingCatalogExtension
  with StagingTableCatalog {

  override def loadTable(ident: Identifier): Table = {
    // HoodieDatasourceTable
  }

  override def createTable(ident: Identifier,
                           schema: StructType,
                           partitions: Array[Transform],
                           properties: util.Map[String, String]): Table = {
    // create hudi table
  }

  override def dropTable(ident: Identifier): Boolean = {
    // drop hudi table
  }

  override def alterTable(ident: Identifier, changes: TableChange*): Table = {
    // check schema compatibility
    // HoodieDatasourceTable
  }

  override def stageReplace(ident: Identifier,
                            schema: StructType,
                            partitions: Array[Transform],
                            properties: util.Map[String, String]): StagedTable = {
    // StagedHoodieTable
  }

  override def stageCreateOrReplace(ident: Identifier,
                                    schema: StructType,
                                    partitions: Array[Transform],
                                    properties: util.Map[String, String]): StagedTable = {
    // StagedHoodieTable
  }
}
```

Users would set the Spark session config `spark.sql.catalog.spark_catalog` to `org.apache.hudi.catalog.HoodieCatalog` to load the `HoodieCatalog` to manage hudi tables.

## Rollout/Adoption Plan

- What impact (if any) will there be on existing users?

There is no impact on existing users, but users would specify the new catalog to manage hudi tables or other tables.

- If we are changing behavior how will we phase out the older behavior?

We should keep compatibility with the V1 version and make it transparent for users to migrate to the V2 API.

## Test Plan

[ ] PoC for catalog plugin
[ ] PoC for writing path with UTs
[ ] PoC for querying path with UTs
[ ] E2E tests
[ ] Benchmark for v1 and v2 writing and querying \ No newline at end of file
diff --git a/rfc/rfc-40/Hudi_Connector.png b/rfc/rfc-40/Hudi_Connector.png new file mode 100644 index 0000000000000..ddb388da4548d Binary files /dev/null and b/rfc/rfc-40/Hudi_Connector.png differ
diff --git a/rfc/rfc-40/rfc-40.md b/rfc/rfc-40/rfc-40.md new file mode 100644 index 0000000000000..2525071551264 --- /dev/null +++ b/rfc/rfc-40/rfc-40.md @@ -0,0 +1,282 @@

# RFC-40: Hudi Connector for Trino

## Proposers

- @codope
- @yihua

## Approvers

- @bvaradar
- @vinothchandar

## Status

JIRA: https://issues.apache.org/jira/browse/HUDI-2687

> Please keep the status updated in `rfc/README.md`.

## Abstract

Today, Hudi supports snapshot queries on Copy-On-Write (COW) tables and read-optimized queries on Merge-On-Read (MOR) tables with Trino, through the input format based integration in the Hive connector. This approach has known performance limitations with very large tables. Moreover, as Hudi keeps getting better, a new plugin to provide access to Hudi data and metadata will help in unlocking capabilities such as metadata-based listing, full schema evolution, etc. for the Trino users. A separate Hudi connector would also allow its independent evolution without having to worry about hacking/breaking the Hive connector. A separate connector also falls in line with our vision when we think of a standalone timeline server or a lake cache to balance the tradeoff between writing and querying.

## Background

The current Trino integration relies on a custom annotation `@UseFileSplitsFromInputFormat`. Any input format that has this annotation would fetch splits by invoking the corresponding input format's `getSplits()` method instead of Trino's Hive connector native split loading logic.
For instance, for realtime queries on Hudi tables via Trino, this would be a simple call to `HoodieParquetRealtimeInputFormat.getSplits()`. This approach has known performance limitations due to the way Trino's split loading is designed, causing redundant Hudi table metadata listing while loading splits. This issue has been fixed in Presto and the work to upstream those changes to Trino is [in progress](https://github.com/trinodb/trino/pull/9641).

A connector enables Trino to communicate with external data sources. The connector interface is composed of four parts: the Metadata API, Data Location API, Data Source API, and Data Sink API. These APIs are designed to allow performant implementations of connectors within the environment of Trino's distributed execution engine. For an overview of the Trino architecture please see [Trino concepts](https://trino.io/docs/current/overview/concepts.html).

### Trino query execution model

When Trino executes a query, it does so by breaking up the execution into a hierarchy of **stages**. A single stage is implemented as a series of **tasks** distributed over a network of Trino workers. Tasks operate on **splits**, which are partitions of a larger data set. Tasks at the source stage produce data in the form of **pages**, which are a collection of rows in columnar format. These pages flow to other intermediate downstream stages.

## Implementation

Trino provides a service provider interface (SPI), which is a type of API used to implement a connector. By implementing the SPI in a connector, Trino can use standard operations internally to connect to any data source and perform operations on any data source. The connector takes care of the details relevant to the specific data source.

The Hudi connector will implement three parts of the API:

- Operations to fetch table/view/schema metadata.
- Operations to produce logical units of data partitioning, so that Trino can parallelize reads and writes.
- Data sources and sinks that convert the source data to/from the in-memory format expected by the query engine.

The Hudi connector will be registered as a plugin, which will be loaded by the Trino server at startup. The entry point will be `HudiPlugin`, an implementation of the `Plugin` interface. Instances of the Hudi connector are created by a `ConnectorFactory` instance, which is created when Trino calls `getConnectorFactory()` on the plugin. A class-diagrammatic view of the different components is shown below.
![](Hudi_Connector.png)

### Operations to fetch table/view/schema metadata

The `ConnectorMetadata` interface provides important methods that are responsible for allowing Trino to look at lists of schemas, lists of tables, lists of columns, and other metadata about a particular data source. The implementation of this interface will create the `HoodieTableMetaClient` and pass it to the connector table handle, through which Trino can access the metadata of a Hudi table.


### Operations to produce logical units of data partitioning

We will need to implement the `ConnectorSplit` and `ConnectorSplitManager` interfaces. Hudi splits will be similar to how the Hive connector describes splits: in the form of a path to a file with an offset and length that indicate which part of the file needs to be processed.
```java
public class HudiSplit
    implements ConnectorSplit {
  private final String path;
  private final long start;
  private final long length;
  private final long fileSize;
  private final List<HostAddress> addresses;
  private final TupleDomain<HiveColumnHandle> predicate;
  private final List<HivePartitionKey> partitionKeys;
  private final SplitWeight splitWeight;
}
```

The split manager will partition the data for a table into the individual chunks that Trino will distribute to workers for processing. This is where the partition loader logic will reside. While listing the files for each Hudi partition, the split manager will create one or more splits per file. Additionally, split generation is dynamic based on size to further improve the performance (see [query planning optimization](#query-planning-optimization) for more details).

During query execution, the Trino coordinator tracks all splits available for processing and the locations where tasks are running on workers and processing splits. As tasks finish processing and are producing more splits for downstream processing, the coordinator continues to schedule tasks until no splits remain for processing. Once all splits are processed on the workers, all data is available, and the coordinator can make the result available to the client.

To support file listing for the different query modes in Hudi, i.e., Read Optimized, Snapshot, and Incremental, the Hudi connector provides the abstraction of `HudiFileListing`, which can be extended to contain the custom logic of generating the particular partitions to scan for a query and the file listing for a partition. The `HudiFileListing` abstraction relies on `HudiPartitionInfo` to get the information of a partition, including the relative partition path, the partition name based on the Hive Metastore, the key-value pairs of this partition, and predicates for the partition columns. We plan to support the Read Optimized query for COW tables first. In the future, we'd like to merge the file listing abstraction into the Hudi repo so that such common file listing functionality can be reused across different query engines.

```java
public abstract class HudiFileListing {
  public abstract List<HudiPartitionInfo> getPartitionsToScan();
  public abstract List<FileStatus> listStatus(HudiPartitionInfo partitionInfo);
}

public abstract class HudiPartitionInfo {
  protected final Table table;
  protected final List<HiveColumnHandle> partitionColumnHandles;
  protected final TupleDomain<HiveColumnHandle> constraintSummary;
  // Relative partition path
  protected String relativePartitionPath;
  // Hive partition name containing partition column key-value pairs
  protected String hivePartitionName;
  // List of partition keys containing column key-value pairs
  protected List<HivePartitionKey> hivePartitionKeys;
}
```

### Data source

As mentioned in the query execution model, tasks in the source stage produce data in the form of pages. The Connector Data Source API returns pages when it is passed a split, and operators typically consume input pages, perform computation, and produce output pages. This is where we will implement the `ConnectorPageSourceProvider` interface to create the page source.

```java
public class HudiPageSourceProvider
    implements ConnectorPageSourceProvider {
  private final HdfsEnvironment hdfsEnvironment;
  private final FileFormatDataSourceStats fileFormatDataSourceStats;
  private final ParquetReaderOptions parquetReaderOptions;
  private final DateTimeZone timeZone;
}
```

We could have different page sources for different base file formats like parquet, orc and avro.
+### Data source
+
+As mentioned in the query execution model, tasks in the source stage produce data in the form of pages. The Connector
+Data Source API returns pages when it is passed a split, and operators typically consume input pages, perform
+computation, and produce output pages. This is where we will implement the `ConnectorPageSourceProvider` interface to
+create the page source.
+
+```java
+public class HudiPageSourceProvider
+        implements ConnectorPageSourceProvider {
+    private final HdfsEnvironment hdfsEnvironment;
+    private final FileFormatDataSourceStats fileFormatDataSourceStats;
+    private final ParquetReaderOptions parquetReaderOptions;
+    private final DateTimeZone timeZone;
+}
+```
+
+We could have different page sources for different base file formats like Parquet, ORC, and Avro. To adapt to these
+different formats, we add an abstraction named `HudiPageSourceCreator` so that each base file format has its
+corresponding logic to create a `ConnectorPageSource` instance. For the Parquet format, we plan to implement
+`HudiParquetPageSourceCreator` by extending `HudiPageSourceCreator` and reusing the `ParquetPageSource` creation in the
+Hive connector. This has the advantage of using Trino's custom `ParquetReader`, which can efficiently skip data sections
+by using statistics in file headers/footers. This is also where we will handle the column projections and build
+predicates for the Parquet reader.
+
+```java
+public abstract class HudiPageSourceCreator {
+    public abstract ConnectorPageSource createPageSource(
+            Configuration configuration,
+            ConnectorIdentity identity,
+            List<HiveColumnHandle> regularColumns,
+            HudiSplit hudiSplit);
+}
+```
+
+### Snapshot queries on MOR table
+
+This requires merging base files and log files.
+One way is to use the `HoodieRealtimeRecordReader`, which can do compacted reading.
+However, this means we would have to give up Trino's optimized Parquet reader.
+Another way is to enumerate the merged splits and use the native reader.
+This can be done in `HoodieRealtimeInputFormatUtils#getRealtimeSplits()`, which is invoked in `HoodieParquetRealtimeInputFormat`.
+We can reuse this logic for reading MOR tables via the connector.
+
+In summary, the Trino coordinator uses the metadata and split manager APIs to gather information about the table and partitions to
+generate a query plan and logical splits of the table contents. Each split is processed by a task in a Trino worker.
+Here, workers invoke the page source APIs as tasks produce data in the form of pages.
+Subsequently, the native (Parquet) reader reads the blocks of pages while executing the query.
+
+## Query Planning Optimization
+
+We make several design decisions to optimize the query planning in the Hudi connector.
+
+### Background loading of Hudi splits
+
+Simply fetching all Hudi splits in a single thread synchronously in the `HudiSplitSource` significantly degrades
+query performance, since the Trino coordinator cannot hand out the splits to the workers for execution until all the
+splits are generated. To remove the bottleneck, we add a background split loader, `HudiSplitBackgroundLoader`, to
+load the Hudi splits asynchronously. Once the query planning begins, the background split loader is initialized and
+starts to run immediately, regardless of whether the coordinator has asked for the next batch of splits (i.e.,
+`HudiSplitSource::getNextBatch`). The background loader keeps adding newly generated splits to an internal connector
+split queue. When the coordinator asks for the next batch of splits by calling `HudiSplitSource::getNextBatch`, the
+method fetches the available splits from the internal connector split queue.
+
+The background split loader internally has a pipeline of processing (a sketch of this structure follows below):
+- Fetching partition information: this step collects the information of all the partitions that need to be read for
+file listing.
+- Listing files in partitions: this step lists all the files per partition. Since each partition is independent of
+another, we list each partition in a concurrent manner. To improve the performance of file listing, there is a thread
+pool in which each thread takes a partition from a queue and does the file listing, until all the partitions are processed.
+- Generating splits from each file: this step generates the splits from the files listed in the second step. Similarly,
+there is a thread pool in which each thread takes a file from a queue and does the split generation.
+
+The background loader keeps track of the progress of split generation and reports the status to `HudiSplitSource`.
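+
+A minimal sketch of this producer/consumer structure is shown below, reusing the `createSplits`
+helper sketched earlier. The class shape, pool size, and queue wiring are assumptions of this
+sketch; only the JDK concurrency types are real.
+
+```java
+import java.util.List;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+public class HudiSplitBackgroundLoader
+        implements Runnable {
+    private final HudiFileListing fileListing;
+    // Shared with HudiSplitSource, whose getNextBatch() drains this queue.
+    private final BlockingQueue<HudiSplit> splitQueue;
+    // Worker pool for the file-listing and split-generation stages.
+    private final ExecutorService executor = Executors.newFixedThreadPool(8);
+
+    public HudiSplitBackgroundLoader(HudiFileListing fileListing, BlockingQueue<HudiSplit> splitQueue) {
+        this.fileListing = fileListing;
+        this.splitQueue = splitQueue;
+    }
+
+    @Override
+    public void run() {
+        // Stage 1: collect the information of all partitions to be read.
+        List<HudiPartitionInfo> partitions = fileListing.getPartitionsToScan();
+        // Stages 2 and 3: each task lists one partition's files and generates the
+        // splits from them, pushing results onto the shared queue so that the
+        // coordinator can start handing out splits before loading completes.
+        for (HudiPartitionInfo partition : partitions) {
+            executor.submit(() -> splitQueue.addAll(HudiSplitFactory.createSplits(fileListing, partition)));
+        }
+    }
+}
+```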
+### Batching Hive metastore calls
+
+It is expensive to make RPC calls to the Hive metastore to fetch information. For example, using `HiveMetastore::getPartition`
+to get the information of a single partition takes around 100ms. Parallelizing the RPC calls to the metastore is not enough
+to meet the performance requirements; e.g., it takes 5-6 seconds to get the information of 200 partitions using
+`HiveMetastore::getPartition`, even with parallelism, due to the bottleneck at the Hive metastore serving the calls.
+
+To address this issue, we batch the partition information fetching using `HiveMetastore::getPartitionsByNames`. Instead
+of fetching the information of one partition per call, this method provides the ability to fetch the information of
+multiple partitions per call. In this way, the number of calls to the Hive metastore to fetch the information of all
+partitions can be drastically reduced. We use an exponentially increasing batch size, starting from 10, with a maximum
+of 100, i.e., the batch size sequence `10, 20, 40, 80, 100, 100, ...`. Using this optimization, it only takes
+around 500ms to get the information of 200 partitions.
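+
+The batching itself is straightforward; a small sketch of the slicing logic (the helper name is
+illustrative) is shown below. Each resulting batch then feeds one `HiveMetastore::getPartitionsByNames` call.
+
+```java
+import java.util.ArrayList;
+import java.util.List;
+
+public final class MetastorePartitionBatching {
+    private MetastorePartitionBatching() {}
+
+    // Slice the partition names into batches whose sizes grow exponentially
+    // (10, 20, 40, 80, 100, 100, ...), so the first results arrive quickly
+    // while later calls amortize the metastore round-trip cost.
+    public static List<List<String>> toBatches(List<String> partitionNames) {
+        List<List<String>> batches = new ArrayList<>();
+        int batchSize = 10;
+        int position = 0;
+        while (position < partitionNames.size()) {
+            int end = Math.min(position + batchSize, partitionNames.size());
+            batches.add(partitionNames.subList(position, end));
+            position = end;
+            batchSize = Math.min(batchSize * 2, 100);
+        }
+        return batches;
+    }
+}
+```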
+### Dynamic size-based split weight
+
+Trino schedules a batch of splits for the page source provider to create pages. Trino decides the number
+of splits in the batch using a quota of 100. By default, each split has a uniform weight of 1, and thus each batch has
+100 splits. If the splits are small in size, there may not be enough splits in the workers for processing, leading
+to inefficient execution. Like the Hive split, the Hudi split incorporates a size-based split weight so that smaller
+splits get lower weights. Trino then packs more splits into a batch if each has a smaller size, thus guaranteeing that
+each batch has enough data to process.
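+
+A sketch of such a weight provider, modeled on the Hive connector's approach, is shown below.
+The class shape and parameter values are illustrative assumptions; `SplitWeight.fromProportion`
+is the Trino SPI method for proportional weights.
+
+```java
+import io.trino.spi.SplitWeight;
+
+public class SizeBasedSplitWeightProvider {
+    private final double minimumWeight;          // e.g., 0.05 (assumed value)
+    private final double targetSplitSizeInBytes; // e.g., 128 * 1024 * 1024 (assumed value)
+
+    public SizeBasedSplitWeightProvider(double minimumWeight, long targetSplitSizeInBytes) {
+        this.minimumWeight = minimumWeight;
+        this.targetSplitSizeInBytes = targetSplitSizeInBytes;
+    }
+
+    public SplitWeight calculateSplitWeight(long splitSizeInBytes) {
+        // Weight is proportional to the split's share of the target size,
+        // clamped so tiny splits still carry a minimum cost and no split
+        // exceeds the standard weight of 1.
+        double proportion = splitSizeInBytes / targetSplitSizeInBytes;
+        return SplitWeight.fromProportion(Math.min(Math.max(proportion, minimumWeight), 1.0));
+    }
+}
+```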
+### Improving listing
+
+In order to improve listing, we assume that the path exists,
+and so we bypass the `FileSystem#exists` check in `AbstractHoodieTableFileSystemView` while fetching the latest base files.
+The connector will also support metadata-based listing, which will retrieve partition listings from Hudi's internal metadata table.
+This should further help improve the performance.
+
+## Rollout/Adoption Plan
+
+- What impact (if any) will there be on existing users?
+
+There will be no impact on existing users because this is a new connector. It does not change the behavior of the current
+integration through the existing Hive connector. It gives users more choice.
+
+- What do we lose if we move away from the Hive connector?
+
+The Hive connector takes advantage of [caching](https://trino.io/docs/current/connector/hive-caching.html) to reduce the load on
+object storage. We will need to use or implement a caching file system like [Rubix](https://github.com/qubole/rubix) that
+is optimized for columnar formats and object stores. This is being tracked by [HUDI-3339](https://issues.apache.org/jira/browse/HUDI-3339).
+
+- If we need special migration tools, describe them here.
+
+The implementation assumes that Hudi tables are synced to Hive. There is no Trino support for migrating Hive tables to
+Hudi, so we need to either use the Hudi APIs or write custom Spark jobs to migrate the tables to Hudi.
+
+- When will we remove the existing behavior?
+
+We are not proposing to remove the existing behavior. We hope that we will have a critical mass of users who would like
+to use the new Hudi connector. That said, we would continue to support the current integration.
+
+## Test Plan
+
+- [x] POC for snapshot query on COW table
+- [x] Unit tests for the connector
+- [ ] Product integration tests
+- [x] Benchmark snapshot query for large tables
diff --git a/rfc/rfc-46/rfc-46.md b/rfc/rfc-46/rfc-46.md
new file mode 100644
index 0000000000000..8b0feff2343db
--- /dev/null
+++ b/rfc/rfc-46/rfc-46.md
@@ -0,0 +1,159 @@
+
+# RFC-46: Optimize Record Payload handling
+
+## Proposers
+
+- @alexeykudinkin
+
+## Approvers
+ - @vinothchandar
+ - @nsivabalan
+ - @xushiyan
+
+## Status
+
+JIRA: https://issues.apache.org/jira/browse/HUDI-3217
+
+> Please keep the status updated in `rfc/README.md`.
+
+## Abstract
+
+Avro has historically been a centerpiece of the Hudi architecture: it is the default representation that many components expect
+when dealing with records (during merge, column value extraction, writing into storage, etc.).
+
+While having a single format for the record representation certainly makes the implementation of some components simpler,
+it bears the unavoidable performance penalty of a de-/serialization loop: every record handled by Hudi has to be converted
+from a (low-level) engine-specific representation (`Row` for Spark, `RowData` for Flink, `ArrayWritable` for Hive) into an intermediate
+one (Avro), with some operations (like clustering and compaction) potentially incurring this penalty multiple times (on the read-
+and write-paths).
+
+As such, the goal of this effort is to remove the need to convert from engine-specific internal representations to Avro
+while handling records.
+
+## Background
+
+Historically, Avro has settled in as the de-facto intermediate representation of the record payload since the early days of Hudi.
+As the project matured and the scale of installations grew, the necessity to convert into an intermediate representation quickly
+became a noticeable bottleneck in the performance of critical Hudi flows.
+
+At the center of it is the hierarchy of `HoodieRecordPayload`s, which is used to hold an individual record's payload,
+providing APIs like `preCombine` and `combineAndGetUpdateValue` to combine it with other records using some user-defined semantic.
+
+## Implementation
+
+### Revisiting Record Classes Hierarchy
+
+To achieve the stated goal of avoiding unnecessary conversions into the intermediate representation (Avro), the existing Hudi
+workflows operating on individual records will have to be refactored and laid out in a way that is _unassuming about the internal
+representation_ of the record, i.e., code should work with a record as an _opaque object_: exposing certain APIs to access
+crucial data (precombine, primary, and partition keys, etc.), but not providing access to the raw payload.
+
+Having the existing workflows restructured in such a way, around a record being an opaque object, would allow us to encapsulate the
+internal representation of the record within its class hierarchy, which in turn would allow us to hold engine-specific (Spark, Flink, etc.)
+representations of the records without exposing purely engine-agnostic components to them.
+
+The following (high-level) steps are proposed:
+
+1. Promote `HoodieRecord` to become a standardized API for interacting with a single record, which will be
+   1. Replacing all accesses currently going through `HoodieRecordPayload`
+   2. Split into an interface and engine-specific implementations (holding the internal engine-specific representation of the payload)
+   3. Implementing new standardized record-level APIs (like `getPartitionKey`, `getRecordKey`, etc.)
+   4. Staying an **internal** component that will **NOT** contain any user-defined semantics (like merging)
+2. Extract the Record Combining (Merge) API from `HoodieRecordPayload` into a standalone, stateless component (engine). Such a component will be
+   1. Abstracted as a stateless object providing an API to combine records (according to predefined semantics) for the engines (Spark, Flink) of interest
+   2. A plug-in point for user-defined combination semantics
+3. Gradually deprecate, phase out, and eventually remove the `HoodieRecordPayload` abstraction
+
+Phasing out the usage of `HoodieRecordPayload` will also bring the benefit of avoiding Java reflection on the hot path, which
+is known to have poor performance (compared to non-reflection-based instantiation).
+
+#### Combine API Engine
+
+A stateless component interface providing the API for combining records will look like the following:
+
+```java
+interface HoodieRecordCombiningEngine {
+
+  default HoodieRecord precombine(HoodieRecord older, HoodieRecord newer) {
+    if (older instanceof SparkHoodieRecord) {
+      return precombineSpark((SparkHoodieRecord) older, (SparkHoodieRecord) newer);
+    }
+    // dispatch to the other engine-specific overloads (Flink, etc.)
+    throw new UnsupportedOperationException();
+  }
+
+  /**
+   * Spark-specific implementation
+   */
+  SparkHoodieRecord precombineSpark(SparkHoodieRecord older, SparkHoodieRecord newer);
+
+  // ...
+}
+```
+Users can provide their own subclasses implementing this interface for the engines of interest.
+
+#### Migration from `HoodieRecordPayload` to `HoodieRecordCombiningEngine`
+
+To warrant backward compatibility (BWC) on the code level with the already-created subclasses of `HoodieRecordPayload` currently
+used in production by Hudi users, we will provide a BWC bridge in the form of an instance of `HoodieRecordCombiningEngine` that
+uses the user-defined subclass of `HoodieRecordPayload` to combine the records.
+
+Leveraging such a bridge will provide for a seamless BWC migration to the 0.11 release; however, it will negate the performance
+benefit of this refactoring, since it will unavoidably have to perform the conversion to the intermediate representation (Avro).
+To realize the full suite of benefits of this refactoring, users will have to migrate their merging logic out of the
+`HoodieRecordPayload` subclass and into a new `HoodieRecordCombiningEngine` implementation.
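+
+A minimal sketch of such a bridge is shown below (the class and helper names are illustrative; the
+Avro round-trip is deliberately spelled out to highlight the penalty the bridge retains):
+
+```java
+// Illustrative BWC bridge: adapts a legacy HoodieRecordPayload to the new
+// combining engine by round-tripping through Avro, which is exactly the
+// penalty that native HoodieRecordCombiningEngine implementations avoid.
+class HoodieRecordPayloadCombiningBridge implements HoodieRecordCombiningEngine {
+
+  @Override
+  public SparkHoodieRecord precombineSpark(SparkHoodieRecord older, SparkHoodieRecord newer) {
+    // 1. Convert both engine-native records into the legacy Avro-based payloads
+    HoodieRecordPayload olderPayload = toAvroPayload(older);
+    HoodieRecordPayload newerPayload = toAvroPayload(newer);
+    // 2. Delegate to the user-defined legacy combining semantic
+    HoodieRecordPayload combined = newerPayload.preCombine(olderPayload);
+    // 3. Convert the result back into the engine-native representation
+    return fromAvroPayload(combined);
+  }
+
+  // Conversion helpers elided: they perform the Avro de-/serialization that
+  // the refactoring otherwise eliminates.
+  private HoodieRecordPayload toAvroPayload(SparkHoodieRecord record) {
+    throw new UnsupportedOperationException("Avro conversion elided in this sketch");
+  }
+
+  private SparkHoodieRecord fromAvroPayload(HoodieRecordPayload payload) {
+    throw new UnsupportedOperationException("Avro conversion elided in this sketch");
+  }
+}
+```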
+### Refactoring Flows Directly Interacting with Records
+
+As was called out before, to achieve the goal of being able to sustain engine-internal representations being held by the `HoodieRecord`
+class without compromising major components' neutrality (i.e., staying engine-agnostic), the components that directly interact with
+records' payloads today will have to be refactored to instead interact with the standardized `HoodieRecord` API.
+
+The following major components will be refactored:
+
+1. `HoodieWriteHandle`s will be
+   1. Accepting `HoodieRecord` instead of a raw Avro payload (avoiding the Avro conversion)
+   2. Using the Combining API engine to merge records (when necessary)
+   3. Passing `HoodieRecord` as is to the `FileWriter`
+2. `HoodieFileWriter`s will be
+   1. Accepting `HoodieRecord`
+   2. Engine-specific (so that they are able to handle the internal record representation)
+3. `HoodieRealtimeRecordReader`s will be
+   1. Returning an opaque `HoodieRecord` instead of a raw Avro payload
+
+
+## Rollout/Adoption Plan
+
+ - What impact (if any) will there be on existing users?
+   - Users of Hudi will observe considerably better performance for most routine operations (writing, reading, compaction, clustering, etc.) due to avoiding the superfluous intermediate de-/serialization penalty
+   - By default, modified hierarchy would still leverage
+   - Users will need to rebase their record-combining logic from subclasses of `HoodieRecordPayload` onto the newly created `HoodieRecordCombiningEngine` interface to get the full suite of performance benefits
+ - If we are changing behavior, how will we phase out the older behavior?
+   - The older behavior leveraging `HoodieRecordPayload` for merging will be marked as deprecated in 0.11 and subsequently removed in a later 0.1x release
+ - If we need special migration tools, describe them here.
+   - No special migration tools will be necessary (other than the BWC bridge to make sure users can use 0.11 out of the box, and there are no breaking changes to the public API)
+ - When will we remove the existing behavior?
+   - In subsequent releases (either 0.12 or 1.0)
+
+## Test Plan
+
+This refactoring will not be modifying any existing Hudi semantics other than the aforementioned, and as such, to guarantee
+preservation of the logical correctness of the many flows that will be affected by the refactoring, we will rely on the
+existing set of test suites.
+
+Nevertheless, we will run a corresponding set of benchmarks stressing the flows affected by the refactoring to validate
+that there is a considerable performance advantage to abandoning the conversion into the intermediate representation completely.
\ No newline at end of file
diff --git a/style/scalastyle.xml b/style/scalastyle.xml
index 2ba4042be0ca4..74d7b9d73a203 100644
--- a/style/scalastyle.xml
+++ b/style/scalastyle.xml
@@ -113,7 +113,7 @@
 
 
-
+