diff --git a/docker/demo/config/test-suite/templates/clustering.yaml.template b/docker/demo/config/test-suite/templates/clustering.yaml.template
index 7b33423e73d5c..fab10ecf4cab9 100644
--- a/docker/demo/config/test-suite/templates/clustering.yaml.template
+++ b/docker/demo/config/test-suite/templates/clustering.yaml.template
@@ -13,6 +13,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+#
+# Yaml to test clustering.
 dag_name: NAME-clustering.yaml
 dag_rounds: clustering_num_iterations
 dag_intermittent_delay_mins: clustering_delay_in_mins
diff --git a/docker/demo/config/test-suite/templates/long-running.yaml.template b/docker/demo/config/test-suite/templates/long_test_suite.yaml.template
similarity index 90%
rename from docker/demo/config/test-suite/templates/long-running.yaml.template
rename to docker/demo/config/test-suite/templates/long_test_suite.yaml.template
index b6392967e3d0e..0715eb27e10e8 100644
--- a/docker/demo/config/test-suite/templates/long-running.yaml.template
+++ b/docker/demo/config/test-suite/templates/long_test_suite.yaml.template
@@ -13,8 +13,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+#
+# Long-running test suite which cleans up the input after every round of the dag. This means validation
+# covers only one round of the dag each time (since the input is cleaned up).
 dag_name: NAME-long-running-multi-partitions.yaml
-dag_rounds: num_iterations
+dag_rounds: long_num_iterations
 dag_intermittent_delay_mins: delay_in_mins
 dag_content:
   first_insert:
@@ -82,7 +85,7 @@ dag_content:
     deps: second_hive_sync
   last_validate:
     config:
-      execute_itr_count: 50
+      execute_itr_count: long_num_iterations
       validate_clean: true
       validate_archival: true
     type: ValidateAsyncOperations
diff --git a/docker/demo/config/test-suite/templates/medium_test_suite.yaml.template b/docker/demo/config/test-suite/templates/medium_test_suite.yaml.template
new file mode 100644
index 0000000000000..b499a92fa692d
--- /dev/null
+++ b/docker/demo/config/test-suite/templates/medium_test_suite.yaml.template
@@ -0,0 +1,92 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Medium test suite which validates the entire input after every dag round. Input accumulates, so validation
+# covers the entire dataset.
+dag_name: NAME-long-running-multi-partitions.yaml
+dag_rounds: medium_num_iterations
+dag_intermittent_delay_mins: delay_in_mins
+dag_content:
+  first_insert:
+    config:
+      record_size: 1000
+      num_partitions_insert: 5
+      repeat_count: 1
+      num_records_insert: 1000
+    type: InsertNode
+    deps: none
+  second_insert:
+    config:
+      record_size: 1000
+      num_partitions_insert: 50
+      repeat_count: 1
+      num_records_insert: 10000
+    deps: first_insert
+    type: InsertNode
+  third_insert:
+    config:
+      record_size: 1000
+      num_partitions_insert: 2
+      repeat_count: 1
+      num_records_insert: 300
+    deps: second_insert
+    type: InsertNode
+  first_hive_sync:
+    config:
+      queue_name: "adhoc"
+      engine: "mr"
+    type: HiveSyncNode
+    deps: third_insert
+  first_validate:
+    config:
+      validate_hive: true
+    type: ValidateDatasetNode
+    deps: first_hive_sync
+  first_upsert:
+    config:
+      record_size: 1000
+      num_partitions_insert: 2
+      num_records_insert: 300
+      repeat_count: 1
+      num_records_upsert: 100
+      num_partitions_upsert: 1
+    type: UpsertNode
+    deps: first_validate
+  first_delete:
+    config:
+      num_partitions_delete: 50
+      num_records_delete: 8000
+    type: DeleteNode
+    deps: first_upsert
+  second_hive_sync:
+    config:
+      queue_name: "adhoc"
+      engine: "mr"
+    type: HiveSyncNode
+    deps: first_delete
+  second_validate:
+    config:
+      validate_hive: true
+      delete_input_data: false
+    type: ValidateDatasetNode
+    deps: second_hive_sync
+  last_validate:
+    config:
+      execute_itr_count: medium_num_iterations
+      validate_clean: true
+      validate_archival: true
+    type: ValidateAsyncOperations
+    deps: second_validate
diff --git a/docker/demo/config/test-suite/templates/sanity.yaml.template b/docker/demo/config/test-suite/templates/sanity.yaml.template
new file mode 100644
index 0000000000000..eae83b6af38ad
--- /dev/null
+++ b/docker/demo/config/test-suite/templates/sanity.yaml.template
@@ -0,0 +1,83 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Sanity yaml to test simple operations.
+dag_name: NAME-sanity.yaml
+dag_rounds: 1
+dag_intermittent_delay_mins: delay_in_mins
+dag_content:
+  first_insert:
+    config:
+      record_size: 1000
+      num_partitions_insert: 5
+      repeat_count: 1
+      num_records_insert: 1000
+    type: InsertNode
+    deps: none
+  second_insert:
+    config:
+      record_size: 1000
+      num_partitions_insert: 50
+      repeat_count: 1
+      num_records_insert: 10000
+    deps: first_insert
+    type: InsertNode
+  third_insert:
+    config:
+      record_size: 1000
+      num_partitions_insert: 2
+      repeat_count: 1
+      num_records_insert: 300
+    deps: second_insert
+    type: InsertNode
+  first_hive_sync:
+    config:
+      queue_name: "adhoc"
+      engine: "mr"
+    type: HiveSyncNode
+    deps: third_insert
+  first_validate:
+    config:
+      validate_hive: true
+    type: ValidateDatasetNode
+    deps: first_hive_sync
+  first_upsert:
+    config:
+      record_size: 1000
+      num_partitions_insert: 2
+      num_records_insert: 300
+      repeat_count: 1
+      num_records_upsert: 100
+      num_partitions_upsert: 1
+    type: UpsertNode
+    deps: first_validate
+  first_delete:
+    config:
+      num_partitions_delete: 50
+      num_records_delete: 8000
+    type: DeleteNode
+    deps: first_upsert
+  second_hive_sync:
+    config:
+      queue_name: "adhoc"
+      engine: "mr"
+    type: HiveSyncNode
+    deps: first_delete
+  second_validate:
+    config:
+      validate_hive: true
+    type: ValidateDatasetNode
+    deps: second_hive_sync
diff --git a/docker/generate_test_suite.sh b/docker/generate_test_suite.sh
index 60655a7a5169d..d7c1405630f0a 100755
--- a/docker/generate_test_suite.sh
+++ b/docker/generate_test_suite.sh
@@ -16,13 +16,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-NUM_ITR=1
+MEDIUM_NUM_ITR=20
+LONG_NUM_ITR=50
 DELAY_MINS=1
 TABLE_TYPE=COPY_ON_WRITE
+INCLUDE_LONG_TEST_SUITE=false
+INCLUDE_MEDIUM_TEST_SUITE=false
 INCLUDE_CLUSTER_YAML=false
-CLUSTER_NUM_ITR=2
+CLUSTER_NUM_ITR=30
 CLUSTER_DELAY_MINS=1
-CLUSTER_ITR_COUNT=1
+CLUSTER_ITR_COUNT=15
+EXECUTE_TEST_SUITE=true
 JAR_NAME=hudi-integ-test-bundle-0.9.0-SNAPSHOT.jar
 INPUT_PATH="/user/hive/warehouse/hudi-integ-test-suite/input/"
 OUTPUT_PATH="/user/hive/warehouse/hudi-integ-test-suite/output/"
@@ -35,8 +39,18 @@
 do
 key="$1"
 case $key in
-    --num_iterations)
-    NUM_ITR="$2"
+    --execute_test_suite)
+    EXECUTE_TEST_SUITE="$2"
+    shift # past argument
+    shift # past value
+    ;;
+    --medium_num_iterations)
+    MEDIUM_NUM_ITR="$2"
+    shift # past argument
+    shift # past value
+    ;;
+    --long_num_iterations)
+    LONG_NUM_ITR="$2"
     shift # past argument
     shift # past value
     ;;
@@ -50,6 +64,16 @@ case $key in
     shift # past argument
     shift # past value
     ;;
+    --include_long_test_suite_yaml)
+    INCLUDE_LONG_TEST_SUITE="$2"
+    shift # past argument
+    shift # past value
+    ;;
+    --include_medium_test_suite_yaml)
+    INCLUDE_MEDIUM_TEST_SUITE="$2"
+    shift # past argument
+    shift # past value
+    ;;
     --include_cluster_yaml)
     INCLUDE_CLUSTER_YAML="$2"
     shift # past argument
     shift # past value
     ;;
@@ -97,13 +121,23 @@ esac
 done
 set -- "${POSITIONAL[@]}" # restore positional parameters
 
-echo "Num Iterations = ${NUM_ITR}"
+echo "Include Medium test suite $INCLUDE_MEDIUM_TEST_SUITE"
+if $INCLUDE_MEDIUM_TEST_SUITE ; then
+  echo "Medium test suite iterations = ${MEDIUM_NUM_ITR}"
+fi
+echo "Include Long test suite $INCLUDE_LONG_TEST_SUITE"
+if $INCLUDE_LONG_TEST_SUITE ; then
+  echo "Long test suite iterations = ${LONG_NUM_ITR}"
+fi
 echo "Intermittent delay in mins = ${DELAY_MINS}"
 echo "Table type = ${TABLE_TYPE}"
+
 echo "Include cluster yaml $INCLUDE_CLUSTER_YAML"
-echo "Cluster total itr count $CLUSTER_NUM_ITR"
-echo "Cluster delay mins $CLUSTER_DELAY_MINS"
-echo "Cluster exec itr count $CLUSTER_ITR_COUNT"
+if $INCLUDE_CLUSTER_YAML ; then
+  echo "Cluster total itr count $CLUSTER_NUM_ITR"
+  echo "Cluster delay mins $CLUSTER_DELAY_MINS"
+  echo "Cluster exec itr count $CLUSTER_ITR_COUNT"
+fi
 echo "Jar name $JAR_NAME"
 INPUT_PATH=$(echo "$INPUT_PATH" | sed "s|\/|\\\/|g")
 echo "Input path $INPUT_PATH"
@@ -125,22 +159,56 @@
 if [ ! -d "demo/config/test-suite/staging" ]; then
   mkdir demo/config/test-suite/staging
 fi
 
-cp demo/config/test-suite/templates/long-running.yaml.template demo/config/test-suite/staging/long-running.yaml
+cp demo/config/test-suite/templates/sanity.yaml.template demo/config/test-suite/staging/sanity.yaml
 
-sed -i '' "s/NAME/$TABLE_TYPE/" demo/config/test-suite/staging/long-running.yaml
-sed -i '' "s/num_iterations/$NUM_ITR/" demo/config/test-suite/staging/long-running.yaml
-sed -i '' "s/delay_in_mins/$DELAY_MINS/" demo/config/test-suite/staging/long-running.yaml
+sed -i '' "s/NAME/$TABLE_TYPE/" demo/config/test-suite/staging/sanity.yaml
 
 cp demo/config/test-suite/templates/test.properties.template demo/config/test-suite/staging/test.properties
 
 sed -i '' "s/INPUT_PATH/$INPUT_PATH/" demo/config/test-suite/staging/test.properties
 
-cp demo/config/test-suite/templates/spark_command.txt.template demo/config/test-suite/staging/long_running_spark_command.sh
+cp demo/config/test-suite/templates/spark_command.txt.template demo/config/test-suite/staging/sanity_spark_command.sh
+
+sed -i '' "s/JAR_NAME/$JAR_NAME/" demo/config/test-suite/staging/sanity_spark_command.sh
+sed -i '' "s/INPUT_PATH/$INPUT_PATH/" demo/config/test-suite/staging/sanity_spark_command.sh
+sed -i '' "s/OUTPUT_PATH/$OUTPUT_PATH/" demo/config/test-suite/staging/sanity_spark_command.sh
+sed -i '' "s/input_yaml/sanity.yaml/" demo/config/test-suite/staging/sanity_spark_command.sh
+sed -i '' "s/TABLE_TYPE/$TABLE_TYPE/" demo/config/test-suite/staging/sanity_spark_command.sh
 
-sed -i '' "s/JAR_NAME/$JAR_NAME/" demo/config/test-suite/staging/long_running_spark_command.sh
-sed -i '' "s/INPUT_PATH/$INPUT_PATH/" demo/config/test-suite/staging/long_running_spark_command.sh
-sed -i '' "s/OUTPUT_PATH/$OUTPUT_PATH/" demo/config/test-suite/staging/long_running_spark_command.sh
-sed -i '' "s/input_yaml/long-running.yaml/" demo/config/test-suite/staging/long_running_spark_command.sh
-sed -i '' "s/TABLE_TYPE/$TABLE_TYPE/" demo/config/test-suite/staging/long_running_spark_command.sh
+if $INCLUDE_MEDIUM_TEST_SUITE ; then
+
+  cp demo/config/test-suite/templates/medium_test_suite.yaml.template demo/config/test-suite/staging/medium_test_suite.yaml
+
+  sed -i '' "s/NAME/$TABLE_TYPE/" demo/config/test-suite/staging/medium_test_suite.yaml
+  sed -i '' "s/medium_num_iterations/$MEDIUM_NUM_ITR/" demo/config/test-suite/staging/medium_test_suite.yaml
+  sed -i '' "s/delay_in_mins/$DELAY_MINS/" demo/config/test-suite/staging/medium_test_suite.yaml
+
+  cp demo/config/test-suite/templates/spark_command.txt.template demo/config/test-suite/staging/medium_test_suite_spark_command.sh
+
+  sed -i '' "s/JAR_NAME/$JAR_NAME/" demo/config/test-suite/staging/medium_test_suite_spark_command.sh
+  sed -i '' "s/INPUT_PATH/$INPUT_PATH/" demo/config/test-suite/staging/medium_test_suite_spark_command.sh
+  sed -i '' "s/OUTPUT_PATH/$OUTPUT_PATH/" demo/config/test-suite/staging/medium_test_suite_spark_command.sh
+  sed -i '' "s/input_yaml/medium_test_suite.yaml/" demo/config/test-suite/staging/medium_test_suite_spark_command.sh
+  sed -i '' "s/TABLE_TYPE/$TABLE_TYPE/" demo/config/test-suite/staging/medium_test_suite_spark_command.sh
+
+fi
+
+if $INCLUDE_LONG_TEST_SUITE ; then
+
+  cp demo/config/test-suite/templates/long_test_suite.yaml.template demo/config/test-suite/staging/long_test_suite.yaml
+
+  sed -i '' "s/NAME/$TABLE_TYPE/" demo/config/test-suite/staging/long_test_suite.yaml
+  sed -i '' "s/long_num_iterations/$LONG_NUM_ITR/" demo/config/test-suite/staging/long_test_suite.yaml
+  sed -i '' "s/delay_in_mins/$DELAY_MINS/" demo/config/test-suite/staging/long_test_suite.yaml
+
+  cp demo/config/test-suite/templates/spark_command.txt.template demo/config/test-suite/staging/long_test_suite_spark_command.sh
+
+  sed -i '' "s/JAR_NAME/$JAR_NAME/" demo/config/test-suite/staging/long_test_suite_spark_command.sh
+  sed -i '' "s/INPUT_PATH/$INPUT_PATH/" demo/config/test-suite/staging/long_test_suite_spark_command.sh
+  sed -i '' "s/OUTPUT_PATH/$OUTPUT_PATH/" demo/config/test-suite/staging/long_test_suite_spark_command.sh
+  sed -i '' "s/input_yaml/long_test_suite.yaml/" demo/config/test-suite/staging/long_test_suite_spark_command.sh
+  sed -i '' "s/TABLE_TYPE/$TABLE_TYPE/" demo/config/test-suite/staging/long_test_suite_spark_command.sh
+
+fi
 
 if $INCLUDE_CLUSTER_YAML ; then
 
@@ -148,7 +216,7 @@ if $INCLUDE_CLUSTER_YAML ; then
 
   sed -i '' "s/NAME/$TABLE_TYPE/" demo/config/test-suite/staging/clustering.yaml
   sed -i '' "s/clustering_num_iterations/$CLUSTER_NUM_ITR/" demo/config/test-suite/staging/clustering.yaml
-  sed -i '' "s/clustering_delay_in_mins/$CLUSTER_DELAY_MINS/" demo/config/test-suite/staging/clustering.yaml
+  sed -i '' "s/delay_in_mins/$CLUSTER_DELAY_MINS/" demo/config/test-suite/staging/clustering.yaml
   sed -i '' "s/clustering_itr_count/$CLUSTER_ITR_COUNT/" demo/config/test-suite/staging/clustering.yaml
 
   cp demo/config/test-suite/templates/spark_command.txt.template demo/config/test-suite/staging/clustering_spark_command.sh
@@ -162,11 +230,27 @@ if $INCLUDE_CLUSTER_YAML ; then
 
 fi
 
-docker cp $CUR_DIR/../packaging/hudi-integ-test-bundle/target/$JAR_NAME adhoc-2:/opt/
-docker exec -it adhoc-2 /bin/bash rm -rf /opt/staging*
-docker cp demo/config/test-suite/staging/ adhoc-2:/opt/
-docker exec -it adhoc-2 /bin/bash /opt/staging/long_running_spark_command.sh
+if $EXECUTE_TEST_SUITE ; then
+
+  docker cp $CUR_DIR/../packaging/hudi-integ-test-bundle/target/$JAR_NAME adhoc-2:/opt/
+  docker exec -it adhoc-2 /bin/bash -c "rm -rf /opt/staging*"
+  docker cp demo/config/test-suite/staging/ adhoc-2:/opt/
+  docker exec -it adhoc-2 /bin/bash -c "echo -e '\n============================== Executing sanity test suite ============================== '"
+  docker exec -it adhoc-2 /bin/bash /opt/staging/sanity_spark_command.sh
+
+  if [ -f demo/config/test-suite/staging/medium_test_suite_spark_command.sh ]; then
+    docker exec -it adhoc-2 /bin/bash -c "echo -e '\n\n\n============================== Executing medium test suite ============================== '"
+    docker exec -it adhoc-2 /bin/bash /opt/staging/medium_test_suite_spark_command.sh
+  fi
+
+  if [ -f demo/config/test-suite/staging/long_test_suite_spark_command.sh ]; then
+    docker exec -it adhoc-2 /bin/bash -c "echo -e '\n\n\n============================== Executing long test suite ============================== '"
+    docker exec -it adhoc-2 /bin/bash /opt/staging/long_test_suite_spark_command.sh
+  fi
+
+  if [ -f demo/config/test-suite/staging/clustering_spark_command.sh ]; then
+    docker exec -it adhoc-2 /bin/bash -c "echo -e '\n\n\n============================== Executing clustering test suite ============================== '"
+    docker exec -it adhoc-2 /bin/bash /opt/staging/clustering_spark_command.sh
+  fi
+
-if [ -f demo/config/test-suite/staging/clustering_spark_command.sh ]; then
-  docker exec -it adhoc-2 /bin/bash /opt/staging/clustering_spark_command.sh
 fi
diff --git a/hudi-integ-test/README.md b/hudi-integ-test/README.md
index 26db278db03f6..68db715a3dd60 100644
--- a/hudi-integ-test/README.md
+++ b/hudi-integ-test/README.md
@@ -488,3 +488,33 @@ Spark submit with the flag:
 --saferSchemaEvolution
 ```
 
+## Automated testing of multiple yamls in a local Docker environment
+
+Hudi provides a script, generate_test_suite.sh, to assist you in testing any number of yamls automatically.
+You can find it under the hudi_root/docker folder.
+
+Example command (execute it from within the docker folder):
+
+```
+./generate_test_suite.sh --execute_test_suite false --include_medium_test_suite_yaml true --include_long_test_suite_yaml true
+```
+
+By default, generate_test_suite.sh runs the sanity test suite. In addition, it supports three more yamls:
+medium_test_suite, long_test_suite and clustering_test_suite. Users can enable the yamls they need via
+command-line options.
+
+Passing "--execute_test_suite false" only generates all required files and yamls in a local staging directory
+(docker/demo/config/test-suite/staging) so that users can inspect them; passing "--execute_test_suite true"
+executes the generated suites as well.
+
+There are also additional configs which users can override depending on their needs. Some of the options are:
+
+--table_type COPY_ON_WRITE/MERGE_ON_READ // table type to test.
+--medium_num_iterations 20 // total number of iterations the medium test suite should run.
+--long_num_iterations 100 // total number of iterations the long test suite should run.
+--intermittent_delay_mins 1 // delay between successive runs within a single test suite job.
+--cluster_num_itr 30 // total number of iterations for the clustering test suite.
+--cluster_delay_mins 2 // delay between successive runs for the clustering test suite job.
+--cluster_exec_itr_count 15 // iteration at which clustering should be triggered.
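+
+For example, a full run that generates and executes the sanity, medium and clustering suites against a
+MERGE_ON_READ table could look like the following (an illustrative invocation; the option values here are
+arbitrary and should be tuned to your environment):
+
+```
+./generate_test_suite.sh --execute_test_suite true --table_type MERGE_ON_READ \
+  --include_medium_test_suite_yaml true --medium_num_iterations 20 \
+  --include_cluster_yaml true --cluster_num_itr 30 --cluster_delay_mins 2 --cluster_exec_itr_count 15
+```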
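+
+As a rough illustration of what generation produces (assuming the medium, long and clustering suites are all
+enabled; the exact contents depend on the flags you pass), the staging directory is expected to hold one yaml
+and one spark command script per suite, plus the shared properties file:
+
+```
+demo/config/test-suite/staging/
+├── sanity.yaml
+├── sanity_spark_command.sh
+├── test.properties
+├── medium_test_suite.yaml
+├── medium_test_suite_spark_command.sh
+├── long_test_suite.yaml
+├── long_test_suite_spark_command.sh
+├── clustering.yaml
+└── clustering_spark_command.sh
+```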