diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java index a4a7e10abc004..0ae72f94b82e0 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java @@ -605,8 +605,6 @@ private Pair, JavaRDD> writeToSink(JavaRDD 0; - long hiveSyncTimeMs = 0; - long metaSyncTimeMs = 0; if (!hasErrors || cfg.commitOnErrors) { HashMap checkpointCommitMetadata = new HashMap<>(); if (checkpointStr != null) { diff --git a/packaging/hudi-utilities-slim-bundle/README.md b/packaging/hudi-utilities-slim-bundle/README.md index 58353c403d325..60ee739153fdd 100644 --- a/packaging/hudi-utilities-slim-bundle/README.md +++ b/packaging/hudi-utilities-slim-bundle/README.md @@ -17,6 +17,89 @@ # Usage of hudi-utilities-slim-bundle -Starting from versions 0.11, Hudi provides hudi-utilities-slim-bundle which excludes hudi-spark-datasource modules. -This new bundle is intended to be used with Hudi Spark bundle together, if using hudi-utilities-bundle solely -introduces problems for a specific Spark version. \ No newline at end of file +Starting from version 0.11, Hudi provides hudi-utilities-slim-bundle, which excludes the hudi-spark-datasource modules. This new bundle is intended to be used together with the Hudi Spark bundle when using +hudi-utilities-bundle alone causes problems for a specific Spark version. 
+ +## Example with Spark 2.4.7 + +* Build Hudi: `mvn clean install -DskipTests` +* Run deltastreamer + +``` +bin/spark-submit \ + --driver-memory 4g --executor-memory 2g --num-executors 3 --executor-cores 1 \ + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ + --conf spark.sql.catalogImplementation=hive \ + --conf spark.driver.maxResultSize=1g \ + --conf spark.ui.port=6679 \ + --packages org.apache.spark:spark-avro_2.11:2.4.7 \ + --jars /path/to/hudi/packaging/hudi-spark-bundle/target/hudi-spark-bundle_2.11-0.12.0-SNAPSHOT.jar \ + --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls /path/to/hudi/packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.11-0.12.0-SNAPSHOT.jar` \ + --props `ls /path/to/hudi/dfs-source.properties` \ + --source-class org.apache.hudi.utilities.sources.ParquetDFSSource \ + --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \ + --source-ordering-field tpep_dropoff_datetime \ + --table-type COPY_ON_WRITE \ + --target-base-path file:\/\/\/tmp/hudi-ny-taxi-spark24/ \ + --target-table ny_hudi_tbl \ + --op UPSERT \ + --continuous \ + --source-limit 5000000 \ + --min-sync-interval-seconds 60 +``` + +## Example with Spark 3.1.2 + +* Build Hudi: `mvn clean install -DskipTests -Dspark3.1 -Dscala-2.12` +* Run deltastreamer + +``` +bin/spark-submit \ + --driver-memory 4g --executor-memory 2g --num-executors 3 --executor-cores 1 \ + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ + --conf spark.sql.catalogImplementation=hive \ + --conf spark.driver.maxResultSize=1g \ + --conf spark.ui.port=6679 \ + --packages org.apache.spark:spark-avro_2.12:3.1.2 \ + --jars /path/to/hudi/packaging/hudi-spark-bundle/target/hudi-spark3.1-bundle_2.12-0.12.0-SNAPSHOT.jar \ + --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls /path/to/hudi/packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-0.12.0-SNAPSHOT.jar` \ + --props 
`ls /path/to/hudi/dfs-source.properties` \ + --source-class org.apache.hudi.utilities.sources.ParquetDFSSource \ + --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \ + --source-ordering-field tpep_dropoff_datetime \ + --table-type COPY_ON_WRITE \ + --target-base-path file:\/\/\/tmp/hudi-ny-taxi-spark31/ \ + --target-table ny_hudi_tbl \ + --op UPSERT \ + --continuous \ + --source-limit 5000000 \ + --min-sync-interval-seconds 60 +``` + +## Example with Spark 3.2.0 + +* Build Hudi: `mvn clean install -DskipTests -Dspark3.2 -Dscala-2.12` +* Run deltastreamer + +``` +bin/spark-submit \ + --driver-memory 4g --executor-memory 2g --num-executors 3 --executor-cores 1 \ + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ + --conf spark.sql.catalogImplementation=hive \ + --conf spark.driver.maxResultSize=1g \ + --conf spark.ui.port=6679 \ + --packages org.apache.spark:spark-avro_2.12:3.2.0 \ + --jars /path/to/hudi/packaging/hudi-spark-bundle/target/hudi-spark3.2-bundle_2.12-0.12.0-SNAPSHOT.jar \ + --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls /path/to/hudi/packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-0.12.0-SNAPSHOT.jar` \ + --props `ls /path/to/hudi/dfs-source.properties` \ + --source-class org.apache.hudi.utilities.sources.ParquetDFSSource \ + --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \ + --source-ordering-field tpep_dropoff_datetime \ + --table-type COPY_ON_WRITE \ + --target-base-path file:\/\/\/tmp/hudi-ny-taxi-spark32/ \ + --target-table ny_hudi_tbl \ + --op UPSERT \ + --continuous \ + --source-limit 5000000 \ + --min-sync-interval-seconds 60 +``` diff --git a/packaging/hudi-utilities-slim-bundle/pom.xml b/packaging/hudi-utilities-slim-bundle/pom.xml index 60f0af9d64f07..993e2ad7fd912 100644 --- a/packaging/hudi-utilities-slim-bundle/pom.xml +++ b/packaging/hudi-utilities-slim-bundle/pom.xml @@ -77,7 +77,7 @@ - true + true 
META-INF/LICENSE @@ -92,10 +92,7 @@ org.apache.hudi:hudi-common org.apache.hudi:hudi-client-common - org.apache.hudi:hudi-spark-client org.apache.hudi:hudi-utilities_${scala.binary.version} - org.apache.hudi:hudi-hive-sync - org.apache.hudi:hudi-sync-common org.apache.hudi:hudi-hadoop-mr org.apache.hudi:hudi-timeline-service org.apache.hudi:hudi-aws @@ -136,13 +133,6 @@ org.apache.kafka:kafka_${scala.binary.version} com.101tec:zkclient org.apache.kafka:kafka-clients - - org.apache.hive:hive-common - org.apache.hive:hive-service - org.apache.hive:hive-service-rpc - org.apache.hive:hive-metastore - org.apache.hive:hive-jdbc - org.apache.hbase:hbase-client org.apache.hbase:hbase-common org.apache.hbase:hbase-hadoop-compat @@ -178,10 +168,6 @@ com.beust.jcommander. org.apache.hudi.com.beust.jcommander. - - org.apache.hive.jdbc. - ${utilities.bundle.hive.shade.prefix}org.apache.hive.jdbc. - org.apache.commons.io. org.apache.hudi.org.apache.commons.io. @@ -205,10 +191,6 @@ org.apache.hadoop.hive.metastore. ${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.metastore. - - org.apache.hive.common. - ${utilities.bundle.hive.shade.prefix}org.apache.hive.common. - org.apache.hadoop.hive.common. ${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.common. @@ -217,10 +199,6 @@ org.apache.hadoop.hive.conf. ${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.conf. - - org.apache.hive.service. - ${utilities.bundle.hive.shade.prefix}org.apache.hive.service. - org.apache.hadoop.hive.service. ${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.service. 
@@ -344,116 +322,27 @@ org.apache.hudi - hudi-client-common - ${project.version} - - - org.apache.hudi - hudi-spark-client - ${project.version} - - - org.apache.hudi - hudi-hive-sync + hudi-utilities_${scala.binary.version} ${project.version} - javax.servlet - servlet-api + org.apache.hudi + hudi-spark-common_${scala.binary.version} + + + org.apache.hudi + hudi-spark_${scala.binary.version} + + + org.apache.hudi + ${hudi.spark.module}_${scala.binary.version} + + + org.apache.hudi + ${hudi.spark.common.module} - - org.apache.hudi - hudi-spark-common_${scala.binary.version} - ${project.version} - provided - - - org.apache.hudi - hudi-spark_${scala.binary.version} - ${project.version} - provided - - - org.apache.hudi - ${hudi.spark.module}_${scala.binary.version} - ${project.version} - provided - - - org.apache.hudi - ${hudi.spark.common.module} - ${project.version} - provided - - - org.apache.hudi - hudi-utilities_${scala.binary.version} - ${project.version} - - - - - ${hive.groupid} - hive-service - ${hive.version} - ${utilities.bundle.hive.scope} - - - - ${hive.groupid} - hive-service-rpc - ${hive.version} - ${utilities.bundle.hive.scope} - - - - ${hive.groupid} - hive-jdbc - ${hive.version} - ${utilities.bundle.hive.scope} - - - - ${hive.groupid} - hive-metastore - ${hive.version} - ${utilities.bundle.hive.scope} - - - - ${hive.groupid} - hive-common - ${hive.version} - ${utilities.bundle.hive.scope} - - - - org.apache.htrace - htrace-core - ${htrace.version} - compile - - - - - org.apache.curator - curator-framework - ${zk-curator.version} - - - - org.apache.curator - curator-client - ${zk-curator.version} - - - - org.apache.curator - curator-recipes - ${zk-curator.version} - diff --git a/pom.xml b/pom.xml index d898d34d35e43..1188ec620aa39 100644 --- a/pom.xml +++ b/pom.xml @@ -99,6 +99,7 @@ 2.8.1 5.3.4 2.17 + 3.0.1-b12 1.10.1 5.7.0-M1 5.7.0-M1 @@ -556,6 +557,12 @@ jersey-container-servlet-core ${glassfish.version} + + org.glassfish + javax.el + 
${glassfish.el.version} + provided +